Source code for simdeep.survival_utils

"""
"""
import re

import pandas as pd

from simdeep.config import PATH_DATA
from simdeep.config import SURVIVAL_FLAG
from simdeep.config import SEPARATOR
from simdeep.config import ENTREZ_TO_ENSG_FILE
from simdeep.config import USE_INPUT_TRANSPOSE
from simdeep.config import DEFAULTSEP
from simdeep.config import CLASSIFIER

import numpy as np

from os.path import isfile

from scipy.stats import rankdata

from numpy import hstack

from sklearn.metrics import pairwise_distances

from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import RobustScaler

from collections import defaultdict

from simdeep.coxph_from_r import coxph
from simdeep.coxph_from_r import c_index

from scipy.stats import kruskal
from scipy.stats import ranksums

from os.path import isdir
from os import mkdir


################ DEBUG ################
# supposed to be None for normal usage
MAX_FEATURE = None
#######################################


class MadScaler():
    """Scale each row by its median and median absolute deviation (MAD)."""

    def __init__(self):
        """ """
        pass

    def fit_transform(self, X):
        """ """
        X = np.asarray(X)

        for i in range(len(X)):
            med = np.median(X[i])
            mad = np.median(np.abs(X[i] - med))
            X[i] = (X[i] - med) / mad

        return np.nan_to_num(np.matrix(X))

class RankNorm():
    """Replace the values of each row by their normalised ranks (in (0, 1])."""

    def __init__(self):
        """ """
        pass

    def fit_transform(self, X):
        """ """
        X = np.asarray(X)
        shape = list(map(float, X.shape))

        for i in range(len(X)):
            X[i] = rankdata(X[i]) / shape[1]

        return np.matrix(X)

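# Illustrative usage sketch (not part of the original module): both scalers
# operate row-wise, so each row is normalised independently of the others.
#
#     X = np.array([[1.0, 2.0, 10.0],
#                   [4.0, 5.0, 6.0]])
#     X_mad = MadScaler().fit_transform(X)    # median-centred, MAD-scaled rows
#     X_rank = RankNorm().fit_transform(X)    # each row mapped to ranks in (0, 1]
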
class SampleReducer():
    """Rank samples by their total signal across datasets and keep the top fraction."""

    def __init__(self, perc_sample_to_keep=0.90):
        """ """
        assert(isinstance(perc_sample_to_keep, float))
        assert(0.0 < perc_sample_to_keep < 1.0)

        self.perc_sample_to_keep = perc_sample_to_keep

    def sample_to_keep(self, datasets, index=None):
        """ """
        nb_samples = len(list(datasets.values())[0][index])
        scores = np.zeros(nb_samples)
        threshold = int(nb_samples * self.perc_sample_to_keep)

        for key in datasets:
            scores_array = np.array([vector.sum() for vector in datasets[key][index]])
            scores = scores + scores_array

        scores = [(pos, score) for pos, score in enumerate(scores)]
        scores.sort(key=lambda x: x[1], reverse=True)

        to_keep = [pos for pos, score in scores[:threshold]]
        to_remove = [pos for pos, score in scores[threshold:]]

        return to_keep, to_remove

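# Hedged usage sketch: from the access pattern `datasets[key][index]` above,
# `datasets` is assumed to map each omic key to a list of samples x features
# matrices. The matrices below are made up.
#
#     datasets = {'RNA': [np.random.rand(10, 50)],
#                 'MIR': [np.random.rand(10, 20)]}
#     reducer = SampleReducer(perc_sample_to_keep=0.8)
#     to_keep, to_remove = reducer.sample_to_keep(datasets, index=0)
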
class VarianceReducer():
    """Keep the `nb_features` features (columns) with the highest variance."""

    def __init__(self, nb_features=200):
        """ """
        self.nb_features = nb_features
        self.index_to_keep = []

    def fit(self, dataset):
        """ """
        if self.nb_features > dataset.shape[1]:
            self.nb_features = dataset.shape[1]

        variances = [np.var(array) for array in dataset.T]
        threshold = sorted(enumerate(variances),
                           reverse=True,
                           key=lambda x: x[1])

        self.index_to_keep = [pos for pos, var in threshold[:self.nb_features]]

    def transform(self, dataset):
        """ """
        return dataset.T[self.index_to_keep].T

    def fit_transform(self, dataset):
        """ """
        self.fit(dataset)
        return self.transform(dataset)

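# Minimal sketch, assuming a samples x features matrix: keep the 100 most
# variable columns.
#
#     matrix = np.random.rand(50, 1000)
#     reduced = VarianceReducer(nb_features=100).fit_transform(matrix)
#     # reduced.shape == (50, 100)
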
class CorrelationReducer():
    """Re-encode samples as their similarity (1 - pairwise distance) to a reference dataset."""

    def __init__(self, distance='correlation', threshold=None):
        """ """
        self.distance = distance
        self.dataset = None
        self.threshold = threshold

    def fit(self, dataset):
        """ """
        self.dataset = dataset

        if self.threshold:
            self.dataset[self.dataset < self.threshold] = 0

    def transform(self, dataset):
        """ """
        if self.threshold:
            dataset[dataset < self.threshold] = 0

        return 1.0 - pairwise_distances(dataset,
                                        self.dataset,
                                        self.distance)

    def fit_transform(self, dataset):
        """ """
        self.fit(dataset)
        return self.transform(dataset)

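# Minimal sketch: each sample is re-encoded as its correlation similarity
# (1 - correlation distance) to every sample of the fitted dataset.
#
#     train = np.random.rand(30, 200)
#     test = np.random.rand(10, 200)
#     reducer = CorrelationReducer()
#     train_corr = reducer.fit_transform(train)   # shape (30, 30)
#     test_corr = reducer.transform(test)         # shape (10, 30)
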
class RankCorrNorm():
    """ """
    def __init__(self, dataset):
        """ """
        self.dataset = dataset

def load_survival_file(f_name,
                       path_data=PATH_DATA,
                       sep=DEFAULTSEP,
                       survival_flag=SURVIVAL_FLAG):
    """Load a survival table and return a dict: {patient_id: (days, event)}."""
    if f_name in SEPARATOR:
        sep = SEPARATOR[f_name]

    survival = {}
    filename = '{0}/{1}'.format(path_data, f_name)

    if not isfile(filename):
        raise Exception('## Error with nonexistent file: {0}'.format(filename))

    with open(filename, 'r') as f_surv:
        first_line = f_surv.readline().strip(' \n\r\t').split(sep)

        for field in survival_flag.values():
            try:
                assert(field in first_line)
            except Exception:
                raise Exception("""#### Exception with survival file {fil} header (first line): "{header}".
                Header does not contain the field "{field}".
                Please define new survival flags using the `survival_flag` variable.
                Current invalid survival_flag: {sflag}.
                Needs variables in header: `{header}` defined as values for the keys:
                "patient_id" => ID of the patient,
                "survival" => time of the study,
                "event" => event after survival time
                """.format(
                    header=first_line,
                    field=field,
                    fil=filename,
                    sflag=survival_flag,
                ))

        patient_id = first_line.index(survival_flag['patient_id'])
        surv_id = first_line.index(survival_flag['survival'])
        event_id = first_line.index(survival_flag['event'])

        for line in f_surv:
            line = line.strip('\n').split(sep)
            ids = line[patient_id].strip('"')
            ndays = line[surv_id].strip('"')
            isdead = line[event_id].strip('"')

            survival[ids] = (float(ndays), float(isdead))

    return survival

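# Illustrative sketch of the expected survival file layout (file name, path
# and flag values are made up; the header only has to match `survival_flag`,
# and a tab separator is assumed here):
#
#     barcode     days    event
#     sample_1    341     1
#     sample_2    1200    0
#
#     survival = load_survival_file(
#         'survival.tsv',
#         path_data='/path/to/data',
#         sep='\t',
#         survival_flag={'patient_id': 'barcode',
#                        'survival': 'days',
#                        'event': 'event'})
#     # survival['sample_1'] == (341.0, 1.0)
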
def translate_index(original_ids, new_ids):
    """ """
    index1d = {ids: pos for pos, ids in enumerate(original_ids)}

    return np.asarray([index1d[sample] for sample in new_ids])

def return_intersection_indexes(ids_1, ids_2):
    """ """
    index1d = {ids: pos for pos, ids in enumerate(ids_1)}
    index2d = {ids: pos for pos, ids in enumerate(ids_2)}

    inter = set(ids_1).intersection(ids_2)

    if len(inter) == 0:
        raise(Exception("Error! No common sample index between: {0}... and {1}...".format(
            ids_1[:2], ids_2[:2])))

    index1 = np.asarray([index1d[sample] for sample in inter])
    index2 = np.asarray([index2d[sample] for sample in inter])

    return index1, index2, list(inter)

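# Minimal sketch of the two index helpers defined above:
#
#     ids_a = ['s1', 's2', 's3']
#     ids_b = ['s3', 's1']
#     translate_index(ids_a, ids_b)    # array([2, 0])
#     idx_a, idx_b, common = return_intersection_indexes(ids_a, ids_b)
#     # idx_a / idx_b index the shared samples in ids_a and ids_b respectively
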
def convert_metadata_frame_to_matrix(frame):
    """Binarize categorical metadata columns and robust-scale numerical ones."""
    lbl = LabelBinarizer()

    normed_matrix = np.zeros((frame.shape[0], 0))
    keys = []

    for key in frame.keys():
        if str(frame[key].dtype) == 'object' or str(frame[key].dtype) == 'string':
            matrix = lbl.fit_transform(frame[key].astype('string'))

            if lbl.y_type_ == "binary":
                keys.append("{0}_{1}".format(key, lbl.classes_[lbl.pos_label]))
            else:
                keys += ["{0}_{1}".format(key, k) for k in lbl.classes_]
        else:
            rbs = RobustScaler()
            matrix = np.asarray(frame[key]).reshape((frame.shape[0], 1))
            matrix = rbs.fit_transform(matrix)
            keys.append(key)

        normed_matrix = hstack([normed_matrix, matrix])

    return pd.DataFrame(normed_matrix, columns=keys)

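# Hedged usage sketch: categorical columns are binarized (one column per class,
# or a single column for binary variables) and numerical columns are
# robust-scaled. The frame below is made up.
#
#     metadata = pd.DataFrame({'sex': ['M', 'F', 'F'],
#                              'age': [61, 48, 75]})
#     encoded = convert_metadata_frame_to_matrix(metadata)
#     # columns: a binarized 'sex_...' column and a scaled 'age' column
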
def load_data_from_tsv(use_transpose=USE_INPUT_TRANSPOSE, **kwargs):
    """ """
    if use_transpose:
        return _load_data_from_tsv_transposee(**kwargs)
    else:
        return _load_data_from_tsv(**kwargs)

def _load_data_from_tsv(
        f_name,
        key,
        path_data=PATH_DATA,
        f_type=float,
        sep=DEFAULTSEP,
        nan_to_num=True):
    """ """
    f_short = key

    if f_name in SEPARATOR:
        sep = SEPARATOR[f_name]

    f_tsv = open("{0}/{1}".format(path_data, f_name))
    header = f_tsv.readline().strip(sep + '\n').split(sep)

    feature_ids = ['{0}_{1}'.format(f_short, head) for head in header][:MAX_FEATURE]
    sample_ids = []
    f_matrix = []

    for line in f_tsv:
        line = line.strip(sep + '\n').split(sep)
        sample_ids.append(line[0])

        if nan_to_num:
            line[1:] = [0 if (l.isalpha() or not l) else l
                        for l in line[1:MAX_FEATURE]]

        f_matrix.append(list(map(f_type, line[1:MAX_FEATURE])))

    f_matrix = np.array(f_matrix)

    if f_matrix.shape[1] == len(feature_ids) - 1:
        feature_ids = feature_ids[1:]

    assert(f_matrix.shape[1] == len(feature_ids))
    assert(f_matrix.shape[0] == len(sample_ids))

    f_tsv.close()

    return sample_ids, feature_ids, f_matrix


def _format_sample_name(sample_ids):
    """ """
    regex = re.compile('_1_[A-Z][A-Z]')
    sample_ids = [regex.sub('', sample.strip('"')) for sample in sample_ids]

    return sample_ids


def _load_data_from_tsv_transposee(
        f_name,
        key,
        path_data=PATH_DATA,
        f_type=float,
        sep=DEFAULTSEP,
        nan_to_num=True):
    """ """
    if f_name in SEPARATOR:
        sep = SEPARATOR[f_name]

    f_tsv = open(path_data + f_name)
    header = f_tsv.readline().strip(sep + '\n').split(sep)

    sample_ids = header[1:]
    sample_ids = _format_sample_name(sample_ids)

    feature_ids = []
    f_matrix = []

    if f_name.lower().count('entrez'):
        ensg_dict = load_entrezID_to_ensg()
        use_ensg = True
    else:
        use_ensg = False

    for line in f_tsv:
        line = line.strip(sep + '\n').split(sep)
        feature = line[0].strip('"')

        if nan_to_num:
            line[1:] = [0 if (l.isalpha() or not l) else l for l in line[1:]]

        if use_ensg and feature in ensg_dict:
            features = ensg_dict[feature]
        else:
            features = [feature]

        for feature in features:
            feature_ids.append('{0}_{1}'.format(key, feature))
            f_matrix.append(list(map(f_type, line[1:])))

    f_matrix = np.array(f_matrix).T

    assert(f_matrix.shape[1] == len(feature_ids))
    assert(f_matrix.shape[0] == len(sample_ids))

    f_tsv.close()

    return sample_ids, feature_ids, f_matrix

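# Illustrative sketch of the two layouts handled above (file name and path are
# made up). Default layout (`_load_data_from_tsv`): one sample per row with
# feature names in the header. Transposed layout
# (`_load_data_from_tsv_transposee`): one feature per row with sample IDs in
# the header.
#
#     samples, features, matrix = load_data_from_tsv(
#         f_name='rna.tsv', key='RNA', path_data='/path/to/data')
#     # feature IDs are prefixed with the key, e.g. 'RNA_<gene>'
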
def select_best_classif_params(clf):
    """
    Select the best classifier parameters based solely on the per-fold test scores
    """
    arr = []

    for fold in range(clf.cv):
        arr.append(clf.cv_results_[
            'split{0}_test_score'.format(fold)])

    score = [ar.max() for ar in np.array(arr).T]
    index = score.index(max(score))
    params = clf.cv_results_['params'][index]

    clf = CLASSIFIER(**params)

    return clf, params

def load_entrezID_to_ensg():
    """ """
    entrez_dict = {}

    for line in open(ENTREZ_TO_ENSG_FILE):
        line = line.split()
        entrez_dict[line[0]] = line[1:]

    return entrez_dict

def _process_parallel_coxph(inp):
    """ """
    node_id, activity, isdead, nbdays, seed, metadata_mat, use_r_packages = inp

    pvalue = coxph(activity, isdead, nbdays,
                   seed=seed,
                   metadata_mat=metadata_mat,
                   use_r_packages=use_r_packages)

    return node_id, pvalue


def _process_parallel_cindex(inp):
    """ """
    (node_id,
     act_ref, isdead_ref, nbdays_ref,
     act_test, isdead_test, nbdays_test,
     use_r_packages) = inp

    score = c_index(act_ref, isdead_ref, nbdays_ref,
                    act_test, isdead_test, nbdays_test,
                    use_r_packages=use_r_packages)

    return node_id, score


def _process_parallel_feature_importance(inp):
    """ """
    arrays = defaultdict(list)
    feature, array, labels = inp

    for label, value in zip(labels, np.array(array).reshape(-1)):
        arrays[label].append(value)

    try:
        score, pvalue = kruskal(*arrays.values())
    except Exception:
        return feature, 1.0

    return feature, pvalue


def _process_parallel_feature_importance_per_cluster(inp):
    """ """
    arrays = defaultdict(list)
    results = []

    feature, array, labels, pval_thres = inp

    for label, value in zip(labels, np.array(array).reshape(-1)):
        arrays[label].append(value)

    for cluster in arrays:
        array = np.array(arrays[cluster])
        array_comp = np.array([a for comp in arrays
                               for a in arrays[comp] if comp != cluster])

        score, pvalue = ranksums(array, array_comp)
        median_diff = np.median(array) - np.median(array_comp)

        if pvalue < pval_thres:
            results.append((cluster, feature, median_diff, pvalue))

    return results


def _process_parallel_survival_feature_importance_per_cluster(inp):
    """ """
    feature, array, survival, metadata_mat, pval_thres, use_r_packages = inp

    nbdays, isdead = survival.T.tolist()

    pvalue = coxph(
        array, isdead, nbdays,
        metadata_mat=metadata_mat,
        use_r_packages=use_r_packages)

    if not np.isnan(pvalue) and pvalue < pval_thres:
        return (feature, pvalue)

    return None, None

def save_matrix(matrix, feature_array, sample_array,
                path_folder, project_name, key='', sep='\t'):
    """ """
    if not isdir(path_folder):
        mkdir(path_folder)

    if key:
        key = '_' + key

    f_csv = open('{0}/{1}{2}.tsv'.format(path_folder, project_name, key), 'w')

    f_csv.write(sep + sep.join(map(lambda x: x.split('_', 1)[-1], feature_array)) + '\n')

    for sample, vector in zip(sample_array, matrix):
        vector = np.asarray(vector).reshape(-1)
        f_csv.write('{0}{1}'.format(sample, sep) + sep.join(map(str, vector)) + '\n')

    print('{0}/{1}{2}.tsv saved'.format(path_folder, project_name, key))
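
# Minimal sketch, using made-up names: writes
# '/tmp/out/example_project_rna.tsv' with one sample per row and the feature
# names (key prefix stripped) as header.
#
#     save_matrix(matrix, features, samples,
#                 path_folder='/tmp/out',
#                 project_name='example_project',
#                 key='rna')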