simdeep package

Submodules

simdeep.config module

simdeep.coxph_from_r module

simdeep.coxph_from_r.c_index(values, isdead, nbdays, values_test, isdead_test, nbdays_test, isfactor=False, use_r_packages=False, seed=None)[source]
simdeep.coxph_from_r.c_index_from_python(values, isdead, nbdays, values_test, isdead_test, nbdays_test, isfactor=False)[source]
simdeep.coxph_from_r.c_index_from_r(values, isdead, nbdays, values_test, isdead_test, nbdays_test, isfactor=False)[source]
simdeep.coxph_from_r.c_index_multiple(values, isdead, nbdays, values_test, isdead_test, nbdays_test, isfactor=False, use_r_packages=False, seed=None)[source]
simdeep.coxph_from_r.c_index_multiple_from_python(matrix, isdead, nbdays, matrix_test, isdead_test, nbdays_test, isfactor=False)[source]
simdeep.coxph_from_r.c_index_multiple_from_r(matrix, isdead, nbdays, matrix_test, isdead_test, nbdays_test, lambda_val=None, isfactor=False)[source]
simdeep.coxph_from_r.convert_to_rmatrix(data)[source]
simdeep.coxph_from_r.coxph(values, isdead, nbdays, do_KM_plot=False, metadata_mat=None, png_path='./', dichotomize_afterward=False, fig_name='KM_plot.png', isfactor=False, use_r_packages=False, seed=None)[source]
simdeep.coxph_from_r.coxph_from_python(values, isdead, nbdays, do_KM_plot=False, png_path='./', metadata_mat=None, dichotomize_afterward=False, fig_name='KM_plot.pdf', penalizer=0.01, l1_ratio=0.0, isfactor=False)[source]
simdeep.coxph_from_r.coxph_from_r(values, isdead, nbdays, do_KM_plot=False, metadata_mat=None, png_path='./', dichotomize_afterward=False, fig_name='KM_plot.png', isfactor=False)[source]
input:
values:

array values of activities

isdead:

array <binary> Event occurred, int boolean: 0/1

nbdays:

array <int>

return:

p-values from the Wald test

simdeep.coxph_from_r.main()[source]

DEBUG

simdeep.coxph_from_r.predict_with_coxph_glmnet(matrix, isdead, nbdays, matrix_test, alpha=0.5, lambda_val=None)[source]
simdeep.coxph_from_r.surv_mean(isdead, nbdays, use_r_packages=False)[source]
simdeep.coxph_from_r.surv_mean_from_python(isdead, nbdays)[source]
simdeep.coxph_from_r.surv_mean_from_r(isdead, nbdays)[source]
simdeep.coxph_from_r.surv_median(isdead, nbdays, use_r_packages=False)[source]
simdeep.coxph_from_r.surv_median_from_python(isdead, nbdays)[source]
simdeep.coxph_from_r.surv_median_from_r(isdead, nbdays)[source]

simdeep.deepmodel_base module

class simdeep.deepmodel_base.DeepBase(dataset=None, verbose=True, epochs=10, level_dims_in=(), level_dims_out=(), new_dim=100, loss='binary_crossentropy', optimizer='adam', act_reg=False, w_reg=False, dropout=0.5, data_split=None, activation='tanh', seed=2020, alternative_embedding=None, kwargs_alternative_embedding={}, path_to_save_model='./')[source]

Bases: object

compile_models()[source]

define the optimizer and the loss function, compile the model, and get ready to fit the data!

construct_autoencoders()[source]

main class to create the autoencoder

construct_supervized_network(objective)[source]

main class to create the autoencoder

create_autoencoders(matrix_out=None)[source]
embedding_predict(key, matrix)[source]

Predict the output value using the matrix as input and the fitted embedding model from self.alternative_embedding_array

encoder_input_shape(key)[source]

Predict the output value using the matrix as input for the encoder from key

encoder_predict(key, matrix)[source]

Predict the output value using the matrix as input for the encoder from key

fit_alternative_embedding()[source]
fit_autoencoders(objective=None)[source]

fit the autoencoder using the training matrix

load_encoders(fname='encoder.h5')[source]

Load a keras model from the self.path_to_save_model directory :fname: str the name of the file to load

load_test_dataset()[source]

load test dataset and test survival

load_training_dataset()[source]

load training dataset and survival

save_encoders(fname='encoder.h5')[source]

Save a keras model in the self.path_to_save_model directory :fname: str the name of the file to save the model

simdeep.deepmodel_base.main()[source]

simdeep.extract_data module

class simdeep.extract_data.LoadData(path_data='/home/docs/checkouts/readthedocs.org/user_builds/deepprog-garmires-lab/checkouts/latest/simdeep/../examples/data/', training_tsv={'GE': 'rna_dummy.tsv', 'METH': 'meth_dummy.tsv', 'MIR': 'mir_dummy.tsv'}, survival_tsv='survival_dummy.tsv', metadata_tsv=None, metadata_test_tsv=None, test_tsv={'MIR': 'mir_test_dummy.tsv'}, survival_tsv_test='survival_test_dummy.tsv', cross_validation_instance=KFold(n_splits=5, random_state=1, shuffle=True), test_fold=0, stack_multi_omic=False, fill_unkown_feature_with_0=True, normalization={'NB_FEATURES_TO_KEEP': 100, 'TRAIN_CORR_RANK_NORM': True, 'TRAIN_CORR_REDUCTION': True, 'TRAIN_MAD_SCALE': False, 'TRAIN_MIN_MAX': False, 'TRAIN_NORM_SCALE': False, 'TRAIN_QUANTILE_TRANSFORM': False, 'TRAIN_RANK_NORM': True, 'TRAIN_ROBUST_SCALE': False, 'TRAIN_ROBUST_SCALE_TWO_WAY': False}, survival_flag={'event': 'recurrence', 'patient_id': 'barcode', 'survival': 'days'}, subset_training_with_meta={}, _autoencoder_parameters={}, verbose=True)[source]

Bases: object

create_a_cv_split()[source]
load_array()[source]
load_matrix_full()[source]
load_matrix_test(normalization=None)[source]
load_matrix_test_fold()[source]
load_meta_data(sep='\t')[source]
load_meta_data_test(metadata_file='', sep='\t')[source]
load_new_test_dataset(tsv_dict, path_survival_file=None, survival_flag=None, normalization=None, metadata_file=None)[source]
load_survival()[source]
load_survival_test(survival_flag=None)[source]
normalize_training_array()[source]
reorder_matrix_array(new_sample_ids)[source]
save_ref_matrix(path_folder, project_name)[source]
subset_training_sets(change_cv=False)[source]
transform_matrices(matrix_ref, matrix, key, normalization=None)[source]

simdeep.plot_utils module

class simdeep.plot_utils.SampleHTML(name, label, proba, survival)[source]

Bases: object

simdeep.plot_utils.make_color_dict(id_list)[source]

According to an id_list, define a color gradient and return {id: color}

simdeep.plot_utils.make_color_dict_from_r(labels)[source]
simdeep.plot_utils.make_color_list(id_list)[source]

According to an id_list, define a color gradient and return {id: color}

simdeep.plot_utils.plot_kernel_plots(test_labels, test_labels_proba, labels, activities, activities_test, dataset, path_html, metadata_frame=None)[source]

perform an HTML kernel plot

simdeep.simdeep_analysis module

DeepProg class for one instance model

class simdeep.simdeep_analysis.SimDeep(nb_clusters=2, pvalue_thres=0.01, cindex_thres=0.65, use_autoencoders=True, feature_surv_analysis=True, cluster_method='coxPHMixture', cluster_eval_method='silhouette', classifier_type='svm', project_name='test_dummy_dataset', path_results='./', cluster_array=[], nb_selected_features=50, mixture_params={'covariance_type': 'diag', 'max_iter': 1000, 'n_init': 100}, node_selection='Cox-PH', nb_threads_coxph=10, classification_method='ALL_FEATURES', load_existing_models=False, path_to_save_model='./', clustering_omics=[], metadata_usage=None, feature_selection_usage='individual', use_r_packages=False, seed=2020, alternative_embedding=None, do_KM_plot=True, verbose=True, _isboosting=False, dataset=None, kwargs_alternative_embedding={}, deep_model_additional_args={})[source]

Bases: DeepBase

Instantiate a new DeepProg instance. The default parameters are defined in the config.py file

Parameters:
dataset:

ExtractData instance. Default None (create a new dataset using the config variable)

nb_clusters:

Number of clusters to search (default NB_CLUSTERS)

pvalue_thres:

Pvalue threshold to include a feature (default PVALUE_THRESHOLD)

clustering_omics:

Which omics to use for clustering. If empty, then all the available omics will be used

cindex_thres:

C-index threshold to include a feature. This parameter is used only if node_selection is set to “C-index” (default CINDEX_THRESHOLD)

cluster_method:

Cluster method to use. possible choice [‘mixture’, ‘kmeans’]. (default CLUSTER_METHOD)

cluster_eval_method:

Cluster evaluation method to use in case the cluster_array parameter is a list of possible K. Possible choice [‘bic’, ‘silhouette’, ‘calinski’] (default CLUSTER_EVAL_METHOD)

classifier_type:

Type of classifier to use. Possible choice [‘svm’, ‘clustering’]. If ‘clustering’ is selected, the predict method of the clustering algorithm is used (default CLASSIFIER_TYPE)

project_name:

Name of the project. This name will be used to save the output files and create the output folder (default PROJECT_NAME)

path_results:

Result folder path used to save the output files (default PATH_RESULTS)

cluster_array:

Array of possible number of clusters to try. If set, nb_clusters is ignored (default CLUSTER_ARRAY)

nb_selected_features:

Number of selected features to construct classifiers (default NB_SELECTED_FEATURES)

mixture_params:

Dictionary of parameters used to instantiate the Gaussian mixture algorithm (default MIXTURE_PARAMS)

node_selection:

Method to select new features. Possible choice [‘Cox-PH’, ‘C-index’]. (default NODES_SELECTION)

nb_threads_coxph:

Number of python processes to use to compute individual survival models in parallel (default NB_THREADS_COXPH)

classification_method:

Possible choice [‘ALL_FEATURES’, ‘SURVIVAL_FEATURES’]. If ‘SURVIVAL_FEATURES’ is selected, the classifiers are built using survival features (default CLASSIFICATION_METHOD)

load_existing_models:

(default LOAD_EXISTING_MODELS)

path_to_save_model:

(default PATH_TO_SAVE_MODEL)

metadata_usage:

Meta data usage with survival models (if metadata_tsv provided as argument to the dataset). Possible choice are [None, False, ‘labels’, ‘new-features’, ‘all’, True] (True is the same as all)

feature_selection_usage:

selection method for survival features (‘individual’ or ‘lasso’)

alternative_embedding:

alternative external embedding to use instead of building autoencoders (default None)

kwargs_alternative_embedding:

parameters for external embedding fitting

compute_c_indexes_for_full_dataset()[source]

return c-index using labels as predicate

compute_c_indexes_for_test_dataset()[source]

return c-index using labels as predicate

compute_c_indexes_for_test_fold_dataset()[source]

return c-index using labels as predicate

compute_c_indexes_for_training_dataset()[source]

return c-index using labels as predicate

compute_c_indexes_multiple_for_test_dataset()[source]

return c-index using labels as predicate

compute_c_indexes_multiple_for_test_fold_dataset()[source]

return c-index using test-fold labels as predicate

compute_feature_scores(use_ref=False)[source]
compute_feature_scores_per_cluster(use_ref=False, pval_thres=0.01)[source]
evalutate_cluster_performance()[source]
fit()[source]

main function I) construct an autoencoder or fit alternative embedding II) predict nodes linked with survival (if active) and III) do clustering

fit_classification_model()[source]
fit_classification_test_model()[source]
fit_on_pretrained_label_file(label_file)[source]

fit a DeepProg simdeep model without training the autoencoder, just using an ID->labels file to train a classifier

load_new_test_dataset(tsv_dict, fname_key=None, path_survival_file=None, normalization=None, survival_flag=None, metadata_file=None)[source]
look_for_prediction_nodes(keys=None)[source]

detect nodes from the autoencoder that predict a high c-index score using labels from the retained test fold

look_for_survival_nodes(keys=None)[source]

detect nodes from the autoencoder significantly linked with survival through coxph regression

plot_kernel_for_test_sets(dataset=None, labels=None, labels_proba=None, test_labels=None, test_labels_proba=None, define_as_main_kernel=False, use_main_kernel=False, activities=None, activities_test=None, key='')[source]
plot_supervised_kernel_for_test_sets(labels=None, labels_proba=None, dataset=None, key='', use_main_kernel=False, test_labels=None, test_labels_proba=None, define_as_main_kernel=False)[source]
predict_labels()[source]

predict labels from training set using K-Means algorithm on the node activities, using only nodes linked to survival

predict_labels_on_full_dataset()[source]
predict_labels_on_test_dataset()[source]
predict_labels_on_test_fold()[source]
predict_labels_using_external_labels(labels, labels_proba)[source]
predict_nodes_activities(matrix_array)[source]
write_feature_score_per_cluster()[source]
write_feature_scores()[source]

simdeep.simdeep_boosting module

class simdeep.simdeep_boosting.SimDeepBoosting(nb_it=10, do_KM_plot=True, distribute=False, nb_threads=5, class_selection='mean', model_thres=0.05, verbose=True, seed=None, project_name='test_dummy_dataset_boosting', use_autoencoders=True, feature_surv_analysis=True, split_n_fold=5, path_results='./', nb_clusters=2, epochs=10, normalization={'NB_FEATURES_TO_KEEP': 100, 'TRAIN_CORR_RANK_NORM': True, 'TRAIN_CORR_REDUCTION': True, 'TRAIN_MAD_SCALE': False, 'TRAIN_MIN_MAX': False, 'TRAIN_NORM_SCALE': False, 'TRAIN_QUANTILE_TRANSFORM': False, 'TRAIN_RANK_NORM': True, 'TRAIN_ROBUST_SCALE': False, 'TRAIN_ROBUST_SCALE_TWO_WAY': False}, nb_selected_features=50, cluster_method='coxPHMixture', pvalue_thres=0.01, classification_method='ALL_FEATURES', new_dim=100, training_tsv={'GE': 'rna_dummy.tsv', 'METH': 'meth_dummy.tsv', 'MIR': 'mir_dummy.tsv'}, metadata_usage=None, survival_tsv='survival_dummy.tsv', metadata_tsv=None, subset_training_with_meta={}, survival_flag={'event': 'recurrence', 'patient_id': 'barcode', 'survival': 'days'}, path_data='/home/docs/checkouts/readthedocs.org/user_builds/deepprog-garmires-lab/checkouts/latest/simdeep/../examples/data/', level_dims_in=(), level_dims_out=(), loss='binary_crossentropy', optimizer='adam', act_reg=False, w_reg=False, dropout=0.5, data_split=None, node_selection='Cox-PH', cindex_thres=0.65, activation='tanh', clustering_omics=[], path_to_save_model='./', feature_selection_usage='individual', use_r_packages=False, alternative_embedding=None, kwargs_alternative_embedding={}, **additional_dataset_args)[source]

Bases: object

Instantiate a new DeepProg Boosting instance. The default parameters are defined in the config.py file

Parameters:
nb_it:

Number of models to construct

do_KM_plot:

Plot Kaplan-Meier (default: True)

distribute:

Distribute DeepProg using ray (default: False)

nb_threads:

Number of python threads to use to compute parallel Cox-PH

class_selection:

Consensus score to agglomerate DeepProg Instance {‘mean’, ‘max’, ‘weighted_mean’, ‘weighted_max’} (default: ‘mean’)

model_thres:

Cox-PH p-value threshold to reject a model for DeepProg Boosting module

verbose:

Verbosity (default: True)

seed:

Seed defining the random split of the training dataset (Default: None).

project_name:

Project name used to save files

use_autoencoders:

Use autoencoder steps to embed the data (default: True)

feature_surv_analysis:

Use individual survival feature detection to filter out features (default: True)

split_n_fold:

For each instance, the original dataset is split in folds and one fold is left

path_results:

Path to create a result folder

nb_clusters:

Number of clusters to use

epochs:

Number of epochs

normalization:

Normalisation procedure to use. See config.py file for details

nb_selected_features:

Number of top features selected for classification

cluster_method:

Clustering method. possible choice: [‘mixture’, ‘kmeans’, ‘coxPH’] or class instance having fit and fit_proba attributes

pvalue_thres:

Threshold for survival significance to set a node as valid

classification_method:

Possible choice: {‘ALL_FEATURES’, ‘SURVIVAL_FEATURES’} (default: ‘ALL_FEATURES’)

new_dim:

Size of the new embedding

training_tsv:

Input matrix files

survival_tsv:

Input survival file

survival_flag:

Survival flag to use

path_data:

Path of the input file

level_dims_in:

Autoencoder node layers before the middle layer (default: [])

level_dims_out:

Autoencoder node layers after the middle layer (default: [])

loss:

Loss function to minimize (default: ‘binary_crossentropy’)

optimizer:

Optimizer (default: adam)

act_reg:

L2 Regularization constant on the node activity (default: False)

w_reg:

L1 Regularization constant on the weight (default: False)

dropout:

Percentage of edges being dropout at each training iteration (None for no dropout) (default: 0.5)

data_split:

Fraction of the dataset to be used as test set when building the autoencoder (default: None)

node_selection:

possible choice: {‘Cox-PH’, ‘C-index’} (default: Cox-PH)

cindex_thres:

Valid if ‘c-index’ is chosen (default: 0.65)

activation:

Activation function (default: ‘tanh’)

clustering_omics:

Which omics to use for clustering. If empty, then all the available omics will be used (default [] => all)

path_to_save_model:

path to save the model

metadata_usage:

Meta data usage with survival models (if metadata_tsv provided as argument to the dataset). Possible choice are [None, False, ‘labels’, ‘new-features’, ‘all’, True] (True is the same as all)

subset_training_with_meta:

Use a metadata key-value dict {meta_key:value} to subset the training sets

alternative_embedding:

alternative external embedding to use instead of building autoencoders (default None)

kwargs_alternative_embedding:

parameters for external embedding fitting

collect_cindex_for_full_dataset()[source]
collect_cindex_for_test_dataset()[source]
collect_cindex_for_test_fold()[source]
collect_cindex_for_training_dataset()[source]
collect_number_of_features_per_omic()[source]
collect_pvalue_on_full_dataset()[source]
collect_pvalue_on_test_dataset()[source]
collect_pvalue_on_test_fold()[source]
collect_pvalue_on_training_dataset()[source]
compute_c_indexes_for_full_dataset()[source]

return c-index using labels as predicate

compute_c_indexes_for_test_dataset()[source]

return c-index using labels as predicate

compute_c_indexes_multiple_for_test_dataset()[source]

Not functional!

compute_clusters_consistency_for_full_labels()[source]
compute_clusters_consistency_for_test_labels()[source]
compute_feature_scores_per_cluster(pval_thres=0.001)[source]
compute_pvalue_for_merged_test_fold()[source]
compute_survival_feature_scores_per_cluster(pval_thres=0.001, use_meta=False)[source]
evalutate_cluster_performance()[source]
fit(debug=False, verbose=False, pretrained_labels_files=[])[source]

if pretrained_labels_files is given, the models are constructed using these labels

fit_on_pretrained_label_file(labels_files=[], labels_files_folder='', file_name_regex='*.tsv', verbose=False, debug=False)[source]

fit DeepProg simdeep models without training autoencoders, instead using ID->labels files (one for each model instance)

load_new_test_dataset(tsv_dict, fname_key=None, path_survival_file=None, normalization=None, debug=False, verbose=False, survival_flag=None, metadata_file=None)[source]
partial_fit(debug=False)[source]
plot_supervised_kernel_for_test_sets()[source]
plot_supervised_predicted_labels_for_test_sets(define_as_main_kernel=False, use_main_kernel=False)[source]
predict_labels_on_full_dataset()[source]
predict_labels_on_test_dataset()[source]
save_cv_models_classes(path_results='')[source]
save_models_classes(path_results='', use_cv_labels=False, use_test_labels=False)[source]
save_test_models_classes(path_results='')[source]
write_feature_score_per_cluster()[source]
write_logs()[source]

simdeep.simdeep_distributed module

simdeep.simdeep_multiple_dataset module

class simdeep.simdeep_multiple_dataset.SimDeepMultiple[source]

Bases: SimDeep

simdeep.simdeep_utils module

simdeep.simdeep_utils.feature_selection_usage_type(value)[source]
simdeep.simdeep_utils.load_labels_file(path_labels, sep='\t')[source]
simdeep.simdeep_utils.load_model(project_name, path_model='./')[source]
simdeep.simdeep_utils.metadata_usage_type(value)[source]
simdeep.simdeep_utils.save_model(boosting, path_to_save_model='./')[source]

simdeep.survival_utils module

class simdeep.survival_utils.CorrelationReducer(distance='correlation', threshold=None)[source]

Bases: object

fit(dataset)[source]
fit_transform(dataset)[source]
transform(dataset)[source]
class simdeep.survival_utils.MadScaler[source]

Bases: object

fit_transform(X)[source]
class simdeep.survival_utils.RankCorrNorm(dataset)[source]

Bases: object

class simdeep.survival_utils.RankNorm[source]

Bases: object

fit_transform(X)[source]
class simdeep.survival_utils.SampleReducer(perc_sample_to_keep=0.9)[source]

Bases: object

sample_to_keep(datasets, index=None)[source]
class simdeep.survival_utils.VarianceReducer(nb_features=200)[source]

Bases: object

fit(dataset)[source]
fit_transform(dataset)[source]
transform(dataset)[source]
simdeep.survival_utils.convert_metadata_frame_to_matrix(frame)[source]
simdeep.survival_utils.load_data_from_tsv(use_transpose=False, **kwargs)[source]
simdeep.survival_utils.load_entrezID_to_ensg()[source]
simdeep.survival_utils.load_survival_file(f_name, path_data='/home/docs/checkouts/readthedocs.org/user_builds/deepprog-garmires-lab/checkouts/latest/simdeep/../examples/data/', sep='\t', survival_flag={'event': 'recurrence', 'patient_id': 'barcode', 'survival': 'days'})[source]
simdeep.survival_utils.return_intersection_indexes(ids_1, ids_2)[source]
simdeep.survival_utils.save_matrix(matrix, feature_array, sample_array, path_folder, project_name, key='', sep='\t')[source]
simdeep.survival_utils.select_best_classif_params(clf)[source]

select the best classifier parameters based solely on test errors

simdeep.survival_utils.translate_index(original_ids, new_ids)[source]

Module contents