Source code for simdeep.survival_utils

"""
"""
import re

import pandas as pd

from simdeep.config import PATH_DATA
from simdeep.config import SURVIVAL_FLAG
from simdeep.config import SEPARATOR
from simdeep.config import ENTREZ_TO_ENSG_FILE
from simdeep.config import USE_INPUT_TRANSPOSE
from simdeep.config import DEFAULTSEP
from simdeep.config import CLASSIFIER

import numpy as np

from os.path import isfile

from scipy.stats import rankdata

from numpy import hstack

from sklearn.metrics import pairwise_distances

from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import RobustScaler

from collections import defaultdict

from simdeep.coxph_from_r import coxph
from simdeep.coxph_from_r import c_index

from scipy.stats import kruskal
from scipy.stats import ranksums

from os.path import isdir
from os import mkdir


################ DEBUG ################
# supposed to be None for normal usage
MAX_FEATURE = None
#######################################


class MadScaler():
    """Scale each row by its median and median absolute deviation (MAD)."""

    def __init__(self):
        """ """
        pass

    def fit_transform(self, X):
        """ """
        X = np.asarray(X)

        for i in range(len(X)):
            med = np.median(X[i])
            mad = np.median(np.abs(X[i] - med))
            X[i] = (X[i] - med) / mad

        return np.nan_to_num(np.matrix(X))

class RankNorm():
    """Replace the values of each row by their normalised ranks (in (0, 1])."""

    def __init__(self):
        """ """
        pass

    def fit_transform(self, X):
        """ """
        X = np.asarray(X)
        shape = list(map(float, X.shape))

        for i in range(len(X)):
            X[i] = rankdata(X[i]) / shape[1]

        return np.matrix(X)

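# Illustrative usage sketch (not part of the original module): both scalers
# operate row-wise, so each row is normalised independently of the others.
#
#     X = np.array([[1.0, 2.0, 10.0],
#                   [4.0, 5.0, 6.0]])
#     X_mad = MadScaler().fit_transform(X)    # median-centred, MAD-scaled rows
#     X_rank = RankNorm().fit_transform(X)    # each row mapped to ranks in (0, 1]
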
class SampleReducer():
    """Rank samples by their total signal across datasets and keep the top fraction."""

    def __init__(self, perc_sample_to_keep=0.90):
        """ """
        assert(isinstance(perc_sample_to_keep, float))
        assert(0.0 < perc_sample_to_keep < 1.0)

        self.perc_sample_to_keep = perc_sample_to_keep

    def sample_to_keep(self, datasets, index=None):
        """ """
        nb_samples = len(list(datasets.values())[0][index])
        scores = np.zeros(nb_samples)
        threshold = int(nb_samples * self.perc_sample_to_keep)

        for key in datasets:
            scores_array = np.array([vector.sum() for vector in datasets[key][index]])
            scores = scores + scores_array

        scores = [(pos, score) for pos, score in enumerate(scores)]
        scores.sort(key=lambda x: x[1], reverse=True)

        to_keep = [pos for pos, score in scores[:threshold]]
        to_remove = [pos for pos, score in scores[threshold:]]

        return to_keep, to_remove

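# Hedged usage sketch: from the access pattern `datasets[key][index]` above,
# `datasets` is assumed to map each omic key to a list of samples x features
# matrices. The matrices below are made up.
#
#     datasets = {'RNA': [np.random.rand(10, 50)],
#                 'MIR': [np.random.rand(10, 20)]}
#     reducer = SampleReducer(perc_sample_to_keep=0.8)
#     to_keep, to_remove = reducer.sample_to_keep(datasets, index=0)
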
class VarianceReducer():
    """Keep the `nb_features` features (columns) with the highest variance."""

    def __init__(self, nb_features=200):
        """ """
        self.nb_features = nb_features
        self.index_to_keep = []

    def fit(self, dataset):
        """ """
        if self.nb_features > dataset.shape[1]:
            self.nb_features = dataset.shape[1]

        variances = [np.var(array) for array in dataset.T]
        threshold = sorted(enumerate(variances),
                           reverse=True,
                           key=lambda x: x[1])

        self.index_to_keep = [pos for pos, var in threshold[:self.nb_features]]

    def transform(self, dataset):
        """ """
        return dataset.T[self.index_to_keep].T

    def fit_transform(self, dataset):
        """ """
        self.fit(dataset)
        return self.transform(dataset)

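# Minimal sketch, assuming a samples x features matrix: keep the 100 most
# variable columns.
#
#     matrix = np.random.rand(50, 1000)
#     reduced = VarianceReducer(nb_features=100).fit_transform(matrix)
#     # reduced.shape == (50, 100)
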
class CorrelationReducer():
    """Re-encode samples as their similarity (1 - pairwise distance) to a reference dataset."""

    def __init__(self, distance='correlation', threshold=None):
        """ """
        self.distance = distance
        self.dataset = None
        self.threshold = threshold

    def fit(self, dataset):
        """ """
        self.dataset = dataset

        if self.threshold:
            self.dataset[self.dataset < self.threshold] = 0

    def transform(self, dataset):
        """ """
        if self.threshold:
            dataset[dataset < self.threshold] = 0

        return 1.0 - pairwise_distances(dataset,
                                        self.dataset,
                                        self.distance)

    def fit_transform(self, dataset):
        """ """
        self.fit(dataset)
        return self.transform(dataset)

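# Minimal sketch: each sample is re-encoded as its correlation similarity
# (1 - correlation distance) to every sample of the fitted dataset.
#
#     train = np.random.rand(30, 200)
#     test = np.random.rand(10, 200)
#     reducer = CorrelationReducer()
#     train_corr = reducer.fit_transform(train)   # shape (30, 30)
#     test_corr = reducer.transform(test)         # shape (10, 30)
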
class RankCorrNorm():
    """ """
    def __init__(self, dataset):
        """ """
        self.dataset = dataset

def load_survival_file(f_name,
                       path_data=PATH_DATA,
                       sep=DEFAULTSEP,
                       survival_flag=SURVIVAL_FLAG):
    """Load a survival table and return a dict: {patient_id: (days, event)}."""
    if f_name in SEPARATOR:
        sep = SEPARATOR[f_name]

    survival = {}
    filename = '{0}/{1}'.format(path_data, f_name)

    if not isfile(filename):
        raise Exception('## Error with nonexistent file: {0}'.format(filename))

    with open(filename, 'r') as f_surv:
        first_line = f_surv.readline().strip(' \n\r\t').split(sep)

        for field in survival_flag.values():
            try:
                assert(field in first_line)
            except Exception:
                raise Exception("""#### Exception with survival file {fil} header (first line): "{header}".
                Header does not contain the field "{field}".
                Please define new survival flags using the `survival_flag` variable.
                Current invalid survival_flag: {sflag}.
                Needs variables in header: `{header}` defined as values for the keys:
                "patient_id" => ID of the patient,
                "survival" => time of the study,
                "event" => event after survival time
                """.format(
                    header=first_line,
                    field=field,
                    fil=filename,
                    sflag=survival_flag,
                ))

        patient_id = first_line.index(survival_flag['patient_id'])
        surv_id = first_line.index(survival_flag['survival'])
        event_id = first_line.index(survival_flag['event'])

        for line in f_surv:
            line = line.strip('\n').split(sep)
            ids = line[patient_id].strip('"')
            ndays = line[surv_id].strip('"')
            isdead = line[event_id].strip('"')

            survival[ids] = (float(ndays), float(isdead))

    return survival

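# Illustrative sketch of the expected survival file layout (file name, path
# and flag values are made up; the header only has to match `survival_flag`,
# and a tab separator is assumed here):
#
#     barcode     days    event
#     sample_1    341     1
#     sample_2    1200    0
#
#     survival = load_survival_file(
#         'survival.tsv',
#         path_data='/path/to/data',
#         sep='\t',
#         survival_flag={'patient_id': 'barcode',
#                        'survival': 'days',
#                        'event': 'event'})
#     # survival['sample_1'] == (341.0, 1.0)
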
def translate_index(original_ids, new_ids):
    """ """
    index1d = {ids: pos for pos, ids in enumerate(original_ids)}

    return np.asarray([index1d[sample] for sample in new_ids])

def return_intersection_indexes(ids_1, ids_2):
    """ """
    index1d = {ids: pos for pos, ids in enumerate(ids_1)}
    index2d = {ids: pos for pos, ids in enumerate(ids_2)}

    inter = set(ids_1).intersection(ids_2)

    if len(inter) == 0:
        raise(Exception("Error! No common sample index between: {0}... and {1}...".format(
            ids_1[:2], ids_2[:2])))

    index1 = np.asarray([index1d[sample] for sample in inter])
    index2 = np.asarray([index2d[sample] for sample in inter])

    return index1, index2, list(inter)

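# Minimal sketch of the two index helpers defined above:
#
#     ids_a = ['s1', 's2', 's3']
#     ids_b = ['s3', 's1']
#     translate_index(ids_a, ids_b)    # array([2, 0])
#     idx_a, idx_b, common = return_intersection_indexes(ids_a, ids_b)
#     # idx_a / idx_b index the shared samples in ids_a and ids_b respectively
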
def convert_metadata_frame_to_matrix(frame):
    """Binarize categorical metadata columns and robust-scale numerical ones."""
    lbl = LabelBinarizer()

    normed_matrix = np.zeros((frame.shape[0], 0))
    keys = []

    for key in frame.keys():
        if str(frame[key].dtype) == 'object' or str(frame[key].dtype) == 'string':
            matrix = lbl.fit_transform(frame[key].astype('string'))

            if lbl.y_type_ == "binary":
                keys.append("{0}_{1}".format(key, lbl.classes_[lbl.pos_label]))
            else:
                keys += ["{0}_{1}".format(key, k) for k in lbl.classes_]
        else:
            rbs = RobustScaler()
            matrix = np.asarray(frame[key]).reshape((frame.shape[0], 1))
            matrix = rbs.fit_transform(matrix)
            keys.append(key)

        normed_matrix = hstack([normed_matrix, matrix])

    return pd.DataFrame(normed_matrix, columns=keys)

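# Hedged usage sketch: categorical columns are binarized (one column per class,
# or a single column for binary variables) and numerical columns are
# robust-scaled. The frame below is made up.
#
#     metadata = pd.DataFrame({'sex': ['M', 'F', 'F'],
#                              'age': [61, 48, 75]})
#     encoded = convert_metadata_frame_to_matrix(metadata)
#     # columns: a binarized 'sex_...' column and a scaled 'age' column
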
def load_data_from_tsv(use_transpose=USE_INPUT_TRANSPOSE, **kwargs):
    """ """
    if use_transpose:
        return _load_data_from_tsv_transposee(**kwargs)
    else:
        return _load_data_from_tsv(**kwargs)

def _load_data_from_tsv(
        f_name,
        key,
        path_data=PATH_DATA,
        f_type=float,
        sep=DEFAULTSEP,
        nan_to_num=True):
    """ """
    f_short = key

    if f_name in SEPARATOR:
        sep = SEPARATOR[f_name]

    f_tsv = open("{0}/{1}".format(path_data, f_name))
    header = f_tsv.readline().strip(sep + '\n').split(sep)

    feature_ids = ['{0}_{1}'.format(f_short, head) for head in header][:MAX_FEATURE]
    sample_ids = []
    f_matrix = []

    for line in f_tsv:
        line = line.strip(sep + '\n').split(sep)
        sample_ids.append(line[0])

        if nan_to_num:
            line[1:] = [0 if (l.isalpha() or not l) else l
                        for l in line[1:MAX_FEATURE]]

        f_matrix.append(list(map(f_type, line[1:MAX_FEATURE])))

    f_matrix = np.array(f_matrix)

    if f_matrix.shape[1] == len(feature_ids) - 1:
        feature_ids = feature_ids[1:]

    assert(f_matrix.shape[1] == len(feature_ids))
    assert(f_matrix.shape[0] == len(sample_ids))

    f_tsv.close()

    return sample_ids, feature_ids, f_matrix


def _format_sample_name(sample_ids):
    """ """
    regex = re.compile('_1_[A-Z][A-Z]')
    sample_ids = [regex.sub('', sample.strip('"')) for sample in sample_ids]

    return sample_ids


def _load_data_from_tsv_transposee(
        f_name,
        key,
        path_data=PATH_DATA,
        f_type=float,
        sep=DEFAULTSEP,
        nan_to_num=True):
    """ """
    if f_name in SEPARATOR:
        sep = SEPARATOR[f_name]

    f_tsv = open(path_data + f_name)
    header = f_tsv.readline().strip(sep + '\n').split(sep)

    sample_ids = header[1:]
    sample_ids = _format_sample_name(sample_ids)

    feature_ids = []
    f_matrix = []

    if f_name.lower().count('entrez'):
        ensg_dict = load_entrezID_to_ensg()
        use_ensg = True
    else:
        use_ensg = False

    for line in f_tsv:
        line = line.strip(sep + '\n').split(sep)
        feature = line[0].strip('"')

        if nan_to_num:
            line[1:] = [0 if (l.isalpha() or not l) else l for l in line[1:]]

        if use_ensg and feature in ensg_dict:
            features = ensg_dict[feature]
        else:
            features = [feature]

        for feature in features:
            feature_ids.append('{0}_{1}'.format(key, feature))
            f_matrix.append(list(map(f_type, line[1:])))

    f_matrix = np.array(f_matrix).T

    assert(f_matrix.shape[1] == len(feature_ids))
    assert(f_matrix.shape[0] == len(sample_ids))

    f_tsv.close()

    return sample_ids, feature_ids, f_matrix

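# Illustrative sketch of the two layouts handled above (file name and path are
# made up). Default layout (`_load_data_from_tsv`): one sample per row with
# feature names in the header. Transposed layout
# (`_load_data_from_tsv_transposee`): one feature per row with sample IDs in
# the header.
#
#     samples, features, matrix = load_data_from_tsv(
#         f_name='rna.tsv', key='RNA', path_data='/path/to/data')
#     # feature IDs are prefixed with the key, e.g. 'RNA_<gene>'
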
def select_best_classif_params(clf):
    """
    Select the best classifier parameters based solely on the per-fold test scores
    """
    arr = []

    for fold in range(clf.cv):
        arr.append(clf.cv_results_[
            'split{0}_test_score'.format(fold)])

    score = [ar.max() for ar in np.array(arr).T]
    index = score.index(max(score))
    params = clf.cv_results_['params'][index]

    clf = CLASSIFIER(**params)

    return clf, params

def load_entrezID_to_ensg():
    """ """
    entrez_dict = {}

    for line in open(ENTREZ_TO_ENSG_FILE):
        line = line.split()
        entrez_dict[line[0]] = line[1:]

    return entrez_dict

def _process_parallel_coxph(inp):
    """ """
    node_id, activity, isdead, nbdays, seed, metadata_mat, use_r_packages = inp

    pvalue = coxph(activity, isdead, nbdays,
                   seed=seed,
                   metadata_mat=metadata_mat,
                   use_r_packages=use_r_packages)

    return node_id, pvalue


def _process_parallel_cindex(inp):
    """ """
    (node_id,
     act_ref, isdead_ref, nbdays_ref,
     act_test, isdead_test, nbdays_test,
     use_r_packages) = inp

    score = c_index(act_ref, isdead_ref, nbdays_ref,
                    act_test, isdead_test, nbdays_test,
                    use_r_packages=use_r_packages)

    return node_id, score


def _process_parallel_feature_importance(inp):
    """ """
    arrays = defaultdict(list)
    feature, array, labels = inp

    for label, value in zip(labels, np.array(array).reshape(-1)):
        arrays[label].append(value)

    try:
        score, pvalue = kruskal(*arrays.values())
    except Exception:
        return feature, 1.0

    return feature, pvalue


def _process_parallel_feature_importance_per_cluster(inp):
    """ """
    arrays = defaultdict(list)
    results = []

    feature, array, labels, pval_thres = inp

    for label, value in zip(labels, np.array(array).reshape(-1)):
        arrays[label].append(value)

    for cluster in arrays:
        array = np.array(arrays[cluster])
        array_comp = np.array([a for comp in arrays
                               for a in arrays[comp] if comp != cluster])

        score, pvalue = ranksums(array, array_comp)
        median_diff = np.median(array) - np.median(array_comp)

        if pvalue < pval_thres:
            results.append((cluster, feature, median_diff, pvalue))

    return results


def _process_parallel_survival_feature_importance_per_cluster(inp):
    """ """
    feature, array, survival, metadata_mat, pval_thres, use_r_packages = inp

    nbdays, isdead = survival.T.tolist()

    pvalue = coxph(
        array, isdead, nbdays,
        metadata_mat=metadata_mat,
        use_r_packages=use_r_packages)

    if not np.isnan(pvalue) and pvalue < pval_thres:
        return (feature, pvalue)

    return None, None

def save_matrix(matrix, feature_array, sample_array,
                path_folder, project_name, key='', sep='\t'):
    """ """
    if not isdir(path_folder):
        mkdir(path_folder)

    if key:
        key = '_' + key

    f_csv = open('{0}/{1}{2}.tsv'.format(path_folder, project_name, key), 'w')

    f_csv.write(sep + sep.join(map(lambda x: x.split('_', 1)[-1], feature_array)) + '\n')

    for sample, vector in zip(sample_array, matrix):
        vector = np.asarray(vector).reshape(-1)
        f_csv.write('{0}{1}'.format(sample, sep) + sep.join(map(str, vector)) + '\n')

    print('{0}/{1}{2}.tsv saved'.format(path_folder, project_name, key))
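
# Minimal sketch, using made-up names: writes
# '/tmp/out/example_project_rna.tsv' with one sample per row and the feature
# names (key prefix stripped) as header.
#
#     save_matrix(matrix, features, samples,
#                 path_folder='/tmp/out',
#                 project_name='example_project',
#                 key='rna')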