import numpy as np
from patsy import dmatrix
from sklearn.model_selection import LeaveOneOut
from scipy import stats
import pandas as pd
from conpagnon.machine_learning.CPM_method import predictors_selection_linear_model, fit_model_on_training_set, \
compute_summary_subjects_summary_values, predictors_selection_correlation, predictor_selection_pcorrelation
def _load_confounding_design_matrix(confounding_variables, confounding_variables_kwarg):
    """Read the confounding variables from a file and build their design matrix.

    Parameters
    ----------
    confounding_variables: list
        Names of the columns to fetch from the file. They are joined with '+'
        to form the patsy formula.
    confounding_variables_kwarg: dict
        A dictionary with a 'file_path' field pointing to a csv, txt or
        xlsx file.

    Returns
    -------
    output: pandas.DataFrame
        The design matrix of the confounding variables, with the 'Intercept'
        column dropped.

    Raises
    ------
    ValueError
        If `confounding_variables_kwarg` is None, or if the file extension
        is not one of csv, txt, xlsx.
    """
    if confounding_variables_kwarg is None:
        raise ValueError("confounding_variables_kwarg must provide a 'file_path' entry "
                         "when confounding_variables is given")

    file_path = confounding_variables_kwarg['file_path']
    # The extension check is case-insensitive; the path itself is passed to pandas untouched.
    lowered_path = file_path.lower()
    # str.endswith needs a tuple to test several suffixes at once.
    if lowered_path.endswith(('csv', 'txt')):
        confounds_table = pd.read_csv(file_path)
    elif lowered_path.endswith('xlsx'):
        confounds_table = pd.read_excel(file_path)
    else:
        raise ValueError('Datafile extension unrecognized')

    # Keep only the columns corresponding to the requested confounding variables
    confounding_variables_data = confounds_table[confounding_variables]

    # Construct the design matrix containing the confound variables
    return dmatrix(formula_like='+'.join(confounding_variables),
                   data=confounding_variables_data,
                   return_type='dataframe').drop(['Intercept'], axis=1)


def predict_behavior(vectorized_connectivity_matrices, behavioral_scores,
                     selection_predictor_method='correlation',
                     significance_selection_threshold=0.01,
                     confounding_variables=None, confounding_variables_kwarg=None):
    """The Connectome Predictive Modelling pipeline.

    This function selects the predictors, then trains and tests a linear model
    on the selected predictors following a Leave One Out cross validation scheme.

    Parameters
    ----------
    vectorized_connectivity_matrices: numpy.array of shape (n_subjects, n_features)
        The stack of the vectorized (lower or upper triangle of the connectivity matrices)
        connectivity matrices. Be careful, the matrices should be stacked in the same order
        as the vector of scores to predict!
    behavioral_scores: numpy.array of shape (n_subjects,) or (n_subjects, 1)
        The vector of scores to predict. The scores should be in the same
        order as the vectorized connectivity matrices stack.
    selection_predictor_method: str, optional
        The predictors selection method. By default, a correlation between
        each connectivity coefficient and the scores is computed, and the resulting
        correlation matrix is thresholded at a type I error rate equal to 0.01.
        Other selection methods are available: 'linear_model', 'partial correlation'.
    significance_selection_threshold: float, optional
        The significance threshold during the selection procedure. By default,
        set to 0.01.
    confounding_variables: list, optional
        A list of the possible confounding variables you might
        want to add, during the selection procedure only.
    confounding_variables_kwarg: dict, optional
        A dictionary with a field called 'file_path'. This field
        should contain the full path to a file (csv, txt or xlsx) containing
        one column per confounding variable.

    Returns
    -------
    output 1: float
        The correlation coefficient between the predicted and true scores
        from the positively correlated set of features.
    output 2: float
        The correlation coefficient between the predicted and true scores
        from the negatively correlated set of features.

    Raises
    ------
    ValueError
        If the number of scores differs from the number of subjects, if
        the confounds file extension is unrecognized, if
        `confounding_variables` is given without `confounding_variables_kwarg`,
        or if `selection_predictor_method` is not understood.
    """
    n_subjects = vectorized_connectivity_matrices.shape[0]

    # Work on a flat vector of scores, so that both (n_subjects,) and
    # (n_subjects, 1) inputs can be indexed and correlated the same way.
    scores = np.ravel(np.asarray(behavioral_scores))
    if scores.shape[0] != n_subjects:
        raise ValueError('behavioral_scores and vectorized_connectivity_matrices '
                         'must contain the same number of subjects')

    # Initialize leave one out object
    leave_one_out_generator = LeaveOneOut()

    # Initialization of behavior prediction vectors, one entry per subject
    behavior_prediction_positive_edges = np.zeros(n_subjects)
    behavior_prediction_negative_edges = np.zeros(n_subjects)

    # Data preprocessing: the confounds design matrix stays None when no
    # confounding variable is requested.
    confounding_variables_matrix = None
    if confounding_variables is not None:
        confounding_variables_matrix = _load_confounding_design_matrix(
            confounding_variables=confounding_variables,
            confounding_variables_kwarg=confounding_variables_kwarg)

    for train_index, test_index in leave_one_out_generator.split(vectorized_connectivity_matrices):
        # For each iteration, split the patients matrices array in train and
        # test set using leave one out cross validation
        patients_train_set = vectorized_connectivity_matrices[train_index]
        leave_one_out_patients = vectorized_connectivity_matrices[test_index]

        # Training set behavioral scores, stored as a column vector
        training_set_behavioral_score_ = np.zeros((patients_train_set.shape[0], 1))
        training_set_behavioral_score_[:, 0] = scores[train_index]

        # The confounding variables for the training set, if any.
        # NOTE(review): 'linear_model' and 'partial correlation' receive None
        # here when no confounds were given; confirm the selection helpers
        # accept that.
        if confounding_variables_matrix is None:
            training_confound_variable_matrix = None
        else:
            training_confound_variable_matrix = confounding_variables_matrix.iloc[train_index]

        # Relate each edge to the behavioral score on the training set
        if selection_predictor_method == 'linear_model':
            R_mat, P_mat = predictors_selection_linear_model(
                training_connectivity_matrices=patients_train_set,
                training_confound_variable_matrix=training_confound_variable_matrix,
                training_set_behavioral_score=training_set_behavioral_score_)
        elif selection_predictor_method == 'correlation':
            R_mat, P_mat = predictors_selection_correlation(
                training_connectivity_matrices=patients_train_set,
                training_set_behavioral_scores=training_set_behavioral_score_)
        elif selection_predictor_method == 'partial correlation':
            R_mat, P_mat = predictor_selection_pcorrelation(
                training_connectivity_matrices=patients_train_set,
                training_set_behavioral_scores=training_set_behavioral_score_,
                training_set_confounding_variables=training_confound_variable_matrix)
        else:
            raise ValueError('Selection method not understood')

        # Compute summary values for both positive and negative edges model
        negative_edges_mask, positive_edges_mask, negative_edges_summary_values, \
            positive_edges_summary_values = compute_summary_subjects_summary_values(
                training_connectivity_matrices=patients_train_set,
                significance_selection_threshold=significance_selection_threshold,
                R_mat=R_mat, P_mat=P_mat)

        # Fit a linear model on the training set
        positive_edge_model_fit, negative_edge_model_fit = fit_model_on_training_set(
            negative_edges_summary_values=negative_edges_summary_values,
            positive_edges_summary_values=positive_edges_summary_values,
            training_set_behavioral_score=training_set_behavioral_score_)

        # Summarize the left out subject with the masks selected on the training set
        left_out_subject = leave_one_out_patients[0, :]
        test_subject_positive_edges_summary = np.sum(np.multiply(left_out_subject, positive_edges_mask))
        test_subject_negative_edges_summary = np.sum(np.multiply(left_out_subject, negative_edges_mask))

        # Apply the fitted models to the left out subject:
        # params[0] + params[1] * summary value
        behavior_prediction_negative_edges[test_index] = \
            negative_edge_model_fit.params[1] * test_subject_negative_edges_summary + \
            negative_edge_model_fit.params[0]
        behavior_prediction_positive_edges[test_index] = \
            positive_edge_model_fit.params[1] * test_subject_positive_edges_summary + \
            positive_edge_model_fit.params[0]

    # Compare prediction and true behavioral score
    R_predict_negative_model, _ = stats.pearsonr(behavior_prediction_negative_edges, scores)
    R_predict_positive_model, _ = stats.pearsonr(scores, behavior_prediction_positive_edges)

    return R_predict_positive_model, R_predict_negative_model