Source code for conpagnon.machine_learning.cpm_predict_behavior

import numpy as np
from patsy import dmatrix
from sklearn.model_selection import LeaveOneOut
from scipy import stats
import pandas as pd
from conpagnon.machine_learning.CPM_method import predictors_selection_linear_model, fit_model_on_training_set, \
    compute_summary_subjects_summary_values, predictors_selection_correlation, predictor_selection_pcorrelation


def predict_behavior(vectorized_connectivity_matrices, behavioral_scores,
                     selection_predictor_method='correlation',
                     significance_selection_threshold=0.01,
                     confounding_variables=None,
                     confounding_variables_kwarg=None):
    """The Connectome Predictive Modelling pipeline.

    This function selects the predictors, then trains and tests a linear model on the
    selected predictors following a Leave One Out cross validation scheme.

    Parameters
    ----------
    vectorized_connectivity_matrices: numpy.array of shape (n_subjects, n_features)
        The stack of the vectorized (lower or upper triangle of the connectivity matrices)
        connectivity matrices. Be careful, the matrices should be stacked in the same
        order as the vector of scores to predict!
    behavioral_scores: numpy.array of shape (n_subjects, )
        The vector of scores to predict. The scores should be in the same order as the
        vectorized connectivity matrices stack.
    selection_predictor_method: str, optional
        The predictor selection method. By default, a correlation between each
        connectivity coefficient and the scores is computed, and the resulting
        correlation matrix is thresholded at a type I error rate equal to 0.01.
        Other selection methods are available: 'linear_model', 'partial correlation'.
    significance_selection_threshold: float, optional
        The significance threshold used during the selection procedure.
        By default, set to 0.01.
    confounding_variables: list, optional
        A list of the possible confounding variables you might want to add, during
        the selection procedure only.
    confounding_variables_kwarg: dict, optional
        A dictionary with a field called 'file_path'. This field should contain the
        full path to a file containing as many columns as confounding variables.

    Returns
    -------
    output 1: float
        The correlation coefficient between the predicted and true scores from the
        positively correlated set of features.
    output 2: float
        The correlation coefficient between the predicted and true scores from the
        negatively correlated set of features.
    """
    # Initialize leave one out object
    leave_one_out_generator = LeaveOneOut()

    # Initialization of behavior prediction vectors
    behavior_prediction_positive_edges = np.zeros(vectorized_connectivity_matrices.shape[0])
    behavior_prediction_negative_edges = np.zeros(vectorized_connectivity_matrices.shape[0])

    # Data preprocessing: build the confounding variables design matrix if requested
    confounding_variables_matrix = None
    if confounding_variables is not None:
        # Read the confounding variables file
        if confounding_variables_kwarg['file_path'].endswith(('csv', 'txt')):
            # Read the text file, and fetch the columns corresponding to the confounding variables
            confounding_variables_data = \
                pd.read_csv(confounding_variables_kwarg['file_path'])[confounding_variables]
            # Construct the design matrix containing the confound variables
            confounding_variables_matrix = dmatrix(formula_like='+'.join(confounding_variables),
                                                   data=confounding_variables_data,
                                                   return_type='dataframe').drop(['Intercept'], axis=1)
        elif confounding_variables_kwarg['file_path'].endswith('xlsx'):
            confounding_variables_data = \
                pd.read_excel(confounding_variables_kwarg['file_path'])[confounding_variables]
            # Construct the design matrix containing the confound variables
            confounding_variables_matrix = dmatrix(formula_like='+'.join(confounding_variables),
                                                   data=confounding_variables_data,
                                                   return_type='dataframe').drop(['Intercept'], axis=1)
        else:
            raise ValueError('Datafile extension unrecognized')

    for train_index, test_index in leave_one_out_generator.split(vectorized_connectivity_matrices):
        # For each iteration, split the patients matrices array in train and
        # test set using leave one out cross validation
        patients_train_set, leave_one_out_patients = \
            vectorized_connectivity_matrices[train_index], vectorized_connectivity_matrices[test_index]

        # Training set behavioral scores
        training_set_behavioral_score_ = np.zeros((patients_train_set.shape[0], 1))
        training_set_behavioral_score_[:, 0] = behavioral_scores[train_index]

        # The confounding variables of the training set, stored in an array
        # (only available when a confounding variables file was provided)
        training_confound_variable_matrix = \
            confounding_variables_matrix.iloc[train_index] if confounding_variables_matrix is not None else None

        if selection_predictor_method == 'linear_model':
            # Correlation of each edge to the behavioral score for the training set,
            # adjusting for the confounding variables
            R_mat, P_mat = predictors_selection_linear_model(
                training_connectivity_matrices=patients_train_set,
                training_confound_variable_matrix=training_confound_variable_matrix,
                training_set_behavioral_score=training_set_behavioral_score_)

            # Compute summary values for both positive and negative edges models
            negative_edges_mask, positive_edges_mask, negative_edges_summary_values, positive_edges_summary_values = \
                compute_summary_subjects_summary_values(
                    training_connectivity_matrices=patients_train_set,
                    significance_selection_threshold=significance_selection_threshold,
                    R_mat=R_mat, P_mat=P_mat)

        elif selection_predictor_method == 'correlation':
            R_mat, P_mat = \
                predictors_selection_correlation(training_connectivity_matrices=patients_train_set,
                                                 training_set_behavioral_scores=training_set_behavioral_score_)

            negative_edges_mask, positive_edges_mask, negative_edges_summary_values, positive_edges_summary_values = \
                compute_summary_subjects_summary_values(
                    training_connectivity_matrices=patients_train_set,
                    significance_selection_threshold=significance_selection_threshold,
                    R_mat=R_mat, P_mat=P_mat)

        elif selection_predictor_method == 'partial correlation':
            R_mat, P_mat = predictor_selection_pcorrelation(
                training_connectivity_matrices=patients_train_set,
                training_set_behavioral_scores=training_set_behavioral_score_,
                training_set_confounding_variables=training_confound_variable_matrix)

            negative_edges_mask, positive_edges_mask, negative_edges_summary_values, positive_edges_summary_values = \
                compute_summary_subjects_summary_values(
                    training_connectivity_matrices=patients_train_set,
                    significance_selection_threshold=significance_selection_threshold,
                    R_mat=R_mat, P_mat=P_mat)
        else:
            raise ValueError('Selection method not understood')

        # Fit a linear model on the training set
        positive_edge_model_fit, negative_edge_model_fit = fit_model_on_training_set(
            negative_edges_summary_values=negative_edges_summary_values,
            positive_edges_summary_values=positive_edges_summary_values,
            training_set_behavioral_score=training_set_behavioral_score_)

        # Compute the left out subject's summary values for the positive and negative edges masks
        test_subject_positive_edges_summary = np.sum(np.multiply(leave_one_out_patients[0, :],
                                                                 positive_edges_mask))
        test_subject_negative_edges_summary = np.sum(np.multiply(leave_one_out_patients[0, :],
                                                                 negative_edges_mask))

        # Predict the left out subject's score with both fitted models
        behavior_prediction_negative_edges[test_index] = \
            negative_edge_model_fit.params[1]*test_subject_negative_edges_summary + \
            negative_edge_model_fit.params[0]

        behavior_prediction_positive_edges[test_index] = \
            positive_edge_model_fit.params[1]*test_subject_positive_edges_summary + \
            positive_edge_model_fit.params[0]

    # Compare predictions and true behavioral scores
    R_predict_negative_model, _ = \
        stats.pearsonr(x=behavior_prediction_negative_edges,
                       y=np.array(behavioral_scores))

    R_predict_positive_model, _ = \
        stats.pearsonr(x=np.array(behavioral_scores),
                       y=behavior_prediction_positive_edges)

    return R_predict_positive_model, R_predict_negative_model
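
A minimal usage sketch of predict_behavior with the default correlation-based edge selection. The variable names (connectivity_matrices, scores), the array shapes, and the use of random data are illustrative assumptions, not part of the module; real vectorized connectivity matrices and behavioral scores would be substituted. With the 'linear_model' or 'partial correlation' selection methods you would additionally pass confounding_variables and a confounding_variables_kwarg={'file_path': ...} pointing to a .csv, .txt or .xlsx file.

import numpy as np
from conpagnon.machine_learning.cpm_predict_behavior import predict_behavior

# Hypothetical data: 40 subjects, 2278 edges (lower triangle of a 68 x 68 connectivity matrix)
rng = np.random.default_rng(seed=0)
connectivity_matrices = rng.standard_normal((40, 2278))
scores = rng.standard_normal(40)

# Leave one out CPM with correlation-based selection at a 0.01 significance threshold
r_positive, r_negative = predict_behavior(
    vectorized_connectivity_matrices=connectivity_matrices,
    behavioral_scores=scores,
    selection_predictor_method='correlation',
    significance_selection_threshold=0.01)

print('positive edges model r =', r_positive)
print('negative edges model r =', r_negative)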