From 6a9e5cba965d9a0c4b7e2bb53bde28da689c4509 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 3 Apr 2024 16:04:25 -0400 Subject: [PATCH 001/401] start simul branch --- simul_dFC/FCS_estimate.py | 144 +++++++++++++++++++++++++ simul_dFC/dFC_assessment.py | 102 ++++++++++++++++++ simul_dFC/task_data_simulator.py | 176 +++++++++++++++++++++++++++++++ 3 files changed, 422 insertions(+) create mode 100644 simul_dFC/FCS_estimate.py create mode 100644 simul_dFC/dFC_assessment.py create mode 100644 simul_dFC/task_data_simulator.py diff --git a/simul_dFC/FCS_estimate.py b/simul_dFC/FCS_estimate.py new file mode 100644 index 0000000..caf2aa0 --- /dev/null +++ b/simul_dFC/FCS_estimate.py @@ -0,0 +1,144 @@ +import os +import time +import warnings + +import numpy as np + +from pydfc import MultiAnalysis, data_loader + +warnings.simplefilter("ignore") + +os.environ["MKL_NUM_THREADS"] = "16" +os.environ["NUMEXPR_NUM_THREADS"] = "16" +os.environ["OMP_NUM_THREADS"] = "16" + +################################# Parameters ################################# +# data paths +dataset = "ds000001" +# main_root = f"./DATA/{dataset}" # for local +main_root = f"../../DATA/task-based/simulated/{dataset}" # for server +roi_root = f"{main_root}/derivatives/ROI_timeseries" +output_root = f"{main_root}/derivatives/fitted_MEASURES" + +# for consistency we use 0 for resting state +TASKS = ["task-pulse"] + +# job_id = int(os.getenv("SGE_TASK_ID")) +# TASK_id = job_id-1 # SGE_TASK_ID starts from 1 not 0 +# if TASK_id >= len(TASKS): +# print("TASK_id out of TASKS") +# exit() +TASK_id = 0 +task = TASKS[TASK_id] + +###### MEASUREMENT PARAMETERS ###### + +# W is in sec + +params_methods = { + # Sliding Parameters + "W": 12, + "n_overlap": 1.0, + "sw_method": "pear_corr", + "tapered_window": True, + # TIME_FREQ + "TF_method": "WTC", + # CLUSTERING AND DHMM + "clstr_base_measure": "SlidingWindow", + # HMM + "hmm_iter": 20, + "dhmm_obs_state_ratio": 16 / 24, + # State Parameters + "n_states": 5, + 
"n_subj_clstrs": 10, + # Parallelization Parameters + "n_jobs": 2, + "verbose": 0, + "backend": "loky", + # SESSION + "session": task, + # Hyper Parameters + "normalization": True, + "num_subj": None, # None or 200? + "num_time_point": None, # None or set? +} + +###### HYPER PARAMETERS ALTERNATIVE ###### + +MEASURES_name_lst = [ + "SlidingWindow", + "Time-Freq", + "CAP", + "ContinuousHMM", + "Windowless", + "Clustering", + "DiscreteHMM", +] + +alter_hparams = { + # 'session': ['Rest1_RL', 'Rest2_LR', 'Rest2_RL'], + # 'n_overlap': [0, 0.25, 0.75, 1], + # 'n_states': [6, 16], + # # 'normalization': [], + # 'num_subj': [50, 100, 200], + # 'num_select_nodes': [30, 50, 333], + # 'num_time_point': [800, 1000], + # 'Fs_ratio': [0.50, 0.75, 1.5], + # 'noise_ratio': [1.00, 2.00, 3.00], + # 'num_realization': [] +} + +###### MultiAnalysis PARAMETERS ###### + +params_multi_analysis = { + # Parallelization Parameters + "n_jobs": None, + "verbose": 0, + "backend": "loky", +} + +################################# LOAD DATA ################################# + +BOLD = data_loader.load_TS( + data_root=roi_root, file_name="time_series.npy", SESSIONs=task, subj_id2load=None +) + +################################# Visualize BOLD ################################# + +# for session in BOLD: +# BOLD.visualize(start_time=0, end_time=2000, nodes_lst=list(range(10)), +# save_image=False, output_root=None) + +################################ Measures of dFC ################################# + +MA = MultiAnalysis( + analysis_name=f"simulated-task-based-dFC-{dataset}-{task}", **params_multi_analysis +) + +MEASURES_lst = MA.measures_initializer(MEASURES_name_lst, params_methods, alter_hparams) + +tic = time.time() +print("Measurement Started ...") + +################################# estimate FCS ################################# + +for MEASURE_id, measure in enumerate(MEASURES_lst): + + print("MEASURE: " + measure.measure_name) + print("FCS estimation started...") + + if measure.is_state_based: 
+ measure.estimate_FCS(time_series=BOLD) + + # dFC_analyzer.estimate_group_FCS(time_series_dict=BOLD) + print("FCS estimation done.") + + # Save + if not os.path.exists(f"{output_root}/{task}"): + os.makedirs(f"{output_root}/{task}") + np.save(f"{output_root}/{task}/MEASURE_{str(MEASURE_id)}.npy", measure) + +print(f"Measurement required {time.time() - tic:0.3f} seconds.") +np.save(f"{output_root}/{task}/multi_analysis.npy", MA) + +################################################################################# diff --git a/simul_dFC/dFC_assessment.py b/simul_dFC/dFC_assessment.py new file mode 100644 index 0000000..d140bd6 --- /dev/null +++ b/simul_dFC/dFC_assessment.py @@ -0,0 +1,102 @@ +import os +import time +import warnings + +import numpy as np + +from pydfc import MultiAnalysis, data_loader + +warnings.simplefilter("ignore") + +os.environ["MKL_NUM_THREADS"] = "16" +os.environ["NUMEXPR_NUM_THREADS"] = "16" +os.environ["OMP_NUM_THREADS"] = "16" + +################################# Parameters ################################# + +# Data parameters +dataset = "ds000001" +# main_root = f"./DATA/{dataset}" # for local +main_root = f"../../DATA/task-based/simulated/{dataset}" # for server + +# subjects used for dFC assessment do not need to be the same as those used for FCS_estimate +# you can set the new roi root and data load parameters here: +roi_root = f"{main_root}/derivatives/ROI_timeseries" +fitted_measures_root = f"{main_root}/derivatives/fitted_MEASURES" +output_root = f"{main_root}/derivatives/dFC_assessed" + +# for consistency we use 0 for resting state. will this cause a problem here?? 
+TASKS = ["task-pulse"] + +# find all subjects across all tasks +SUBJECTS = data_loader.find_subj_list(data_root=roi_root, sessions=TASKS) + +# job_id selects the subject +job_id = int(os.getenv("SGE_TASK_ID")) +if job_id > len(SUBJECTS): + print("job_id > len(SUBJECTS)") + exit() +subj_id = SUBJECTS[job_id - 1] # SGE_TASK_ID starts from 1 not 0 + +for task in TASKS: + + MA = np.load( + f"{fitted_measures_root}/{task}/multi_analysis.npy", allow_pickle="TRUE" + ).item() + + # check if the subject has this task + SUBJECTS_with_this_task = data_loader.find_subj_list( + data_root=roi_root, sessions=[task] + ) + if not subj_id in SUBJECTS_with_this_task: + print(f"subject {subj_id} not in the list of subjects with task {task}") + continue + + ################################# LOAD FIT MEASURES ################################# + + ALL_RECORDS = os.listdir(f"{fitted_measures_root}/{task}/") + ALL_RECORDS = [i for i in ALL_RECORDS if "MEASURE" in i] + ALL_RECORDS.sort() + MEASURES_fit_lst = list() + for s in ALL_RECORDS: + fit_measure = np.load( + f"{fitted_measures_root}/{task}/{s}", allow_pickle="TRUE" + ).item() + MEASURES_fit_lst.append(fit_measure) + MA.set_MEASURES_fit_lst(MEASURES_fit_lst) + print("fitted MEASURES loaded ...") + + ################################# LOAD DATA ################################# + + print( + f"subject-level dFC assessment CODE started running ... for task {task} of subject {subj_id} ..." 
+ ) + + BOLD = data_loader.load_TS( + data_root=roi_root, + file_name="time_series.npy", + SESSIONs=[task], + subj_id2load=subj_id, + ) + + ################################# dFC ASSESSMENT ################################# + + tic = time.time() + print("Measurement Started ...") + + print("dFC estimation started...") + dFC_dict = MA.subj_lvl_dFC_assess(time_series_dict=BOLD) + print("dFC estimation done.") + + print(f"Measurement required {time.time() - tic:0.3f} seconds.") + + ################################# SAVE DATA ################################# + + folder = f"{output_root}/{task}/{subj_id}" + if not os.path.exists(folder): + os.makedirs(folder) + + for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]): + np.save(f"{folder}/dFC_{str(dFC_id)}.npy", dFC) + +####################################################################################### diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py new file mode 100644 index 0000000..98fa832 --- /dev/null +++ b/simul_dFC/task_data_simulator.py @@ -0,0 +1,176 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed March 20 2024 + +@author: mte +""" +import os +import warnings + +import numpy as np +from tvb.simulator.lab import * + +from pydfc import TIME_SERIES, task_utils + +warnings.simplefilter("ignore") + +os.environ["MKL_NUM_THREADS"] = "16" +os.environ["NUMEXPR_NUM_THREADS"] = "16" +os.environ["OMP_NUM_THREADS"] = "16" +################################# Parameters #################################### + +# data paths +dataset = "ds000002" +# main_root = f"./DATA/{dataset}" # for local +main_root = f"../../DATA/task-based/simulated/{dataset}" # for server +output_root = f"{main_root}/derivatives/ROI_timeseries" + +task = "task-pulse" + +# simulation parameters +sim_length = 250e3 # in m sec +onset_time = 20.0 # in seconds +task_duration = 12.0 # in seconds +task_block_duration = 30.0 # in seconds +BOLD_period = 500 # in m sec +TAVG_period = 1.0 # in m sec +conn_speed = 1.0 +D = 0.001 # 
noise dispersion +dt = 0.5 # integration step +n_subj = 200 # number of subjects + +# create a subject id list +subj_list = [f"sub-{i:04d}" for i in range(1, n_subj + 1)] + +job_id = int(os.getenv("SGE_TASK_ID")) +subj_id = subj_list[job_id - 1] # SGE_TASK_ID starts from 1 not 0 + +print(f"subject-level simulation started running ... for subject: {subj_id} ...") + +# randomize some parameters for each subjects +onset = np.random.normal(loc=onset_time, scale=0.5) # seconds +global_conn_coupling = np.random.normal(loc=0.0126, scale=0.0075) +rand_weighting = np.array( + [ + np.random.normal(loc=2.0**-2, scale=0.1 * (2.0**-2)), + np.random.normal(loc=2.0**-3, scale=0.1 * (2.0**-3)), + np.random.normal(loc=2.0**-4, scale=0.1 * (2.0**-4)), + np.random.normal(loc=2.0**-5, scale=0.1 * (2.0**-5)), + np.random.normal(loc=2.0**-6, scale=0.1 * (2.0**-6)), + ] +) +conn_speed_rand = np.random.normal(loc=conn_speed, scale=0.1 * conn_speed) +################################# Initialize Simulation #################################### +conn = connectivity.Connectivity.from_file() +conn.speed = np.array([conn_speed_rand]) + +# configure stimulus spatial pattern +weighting = np.zeros((76,)) +weighting[[0, 7, 13, 33, 42]] = rand_weighting +# weighting[[0, 7, 13, 33, 42]] = numpy.array([2.0 ** -2, 2.0 ** -3, 2.0 ** -4, 2.0 ** -5, 2.0 ** -6]) + +# temporal profile +eqn_t = equations.PulseTrain() +eqn_t.parameters["onset"] = onset * 1e3 # ms +eqn_t.parameters["tau"] = task_duration * 1e3 # ms +eqn_t.parameters["T"] = task_block_duration * 1e3 # ms + +stimulus = patterns.StimuliRegion(temporal=eqn_t, connectivity=conn, weight=weighting) + +################################# Run Simulation #################################### + +# set the global coupling strength +# you can switch between deterministic (without noise) and stochastic integration (with noise) +sim = simulator.Simulator( + model=models.Generic2dOscillator(a=np.array([0.5])), + connectivity=conn, + 
coupling=coupling.Linear(a=np.array([global_conn_coupling])), + # integrator=integrators.HeunDeterministic(dt=dt), + integrator=integrators.HeunStochastic( + dt=dt, noise=noise.Additive(nsig=np.array([D])) + ), + monitors=( + monitors.TemporalAverage(period=TAVG_period), + monitors.Bold(period=BOLD_period, hrf_kernel=equations.MixtureOfGammas()), + monitors.ProgressLogger(period=10e3), + ), + stimulus=stimulus, + simulation_length=sim_length, +).configure() + +(tavg_time, tavg_data), (bold_time, bold_data), _ = sim.run() + +# # truncate the first 10 seconds of the simulation +# # to avoid transient effects +# truncate_time = 10e3 # in m sec +# bold_truncate_idx = int(truncate_time / BOLD_period) +# bold_time = bold_time[bold_truncate_idx:] +# bold_data = bold_data[bold_truncate_idx:] +# tavg_truncate_idx = int(truncate_time / TAVG_period) +# tavg_time = tavg_time[tavg_truncate_idx:] +# tavg_data = tavg_data[tavg_truncate_idx:] + +centres_locs = conn.centres +region_labels = list(conn.region_labels) +TR_mri = BOLD_period * 1e-3 # in seconds + +bold_data = bold_data[:, 0, :, 0] +# change time_series.shape to (roi, time) +bold_data = bold_data.T + +time_series = TIME_SERIES( + data=bold_data, + subj_id=subj_id, + Fs=1 / TR_mri, + locs=centres_locs, + node_labels=region_labels, + TS_name=f"BOLD_{subj_id}_{task}", + session_name=task, +) +num_time_mri = time_series.n_time +################################# EXTRACT TASK LABELS ######################### +oversampling = 50 # more samples per TR than the func data to have a better event_labels time resolution + +events = [] +event_types = ["rest", "task"] +TASKS = [task] + +# using onset, task_duration, task_block_duration to create the events +events.append(["onset", "duration", "trial_type"]) +t = onset +while t < sim_length: + events.append([t, task_duration, "task"]) + t += task_block_duration +events = np.array(events) + +event_labels, Fs_task = task_utils.events_time_to_labels( + events=events, + TR_mri=TR_mri, + 
num_time_mri=num_time_mri, + event_types=event_types, + oversampling=oversampling, + return_0_1=False, +) +# fill task labels with 0 (rest) and 1 (task's index, here only 1 task is used) +task_labels = np.multiply(event_labels != 0, 1) +################################# SAVE ################################# +# save the ROI time series and task data +task_data = { + "task": task, + "task_labels": task_labels, + "task_types": TASKS, + "event_labels": event_labels, + "event_types": event_types, + "events": events, + "Fs_task": Fs_task, + "TR_mri": TR_mri, + "num_time_mri": num_time_mri, +} +subj_folder = f"{subj_id}_{task}" +if not os.path.exists(f"{output_root}/{subj_folder}/"): + os.makedirs(f"{output_root}/{subj_folder}/") +np.save(f"{output_root}/{subj_folder}/time_series.npy", time_series) +np.save(f"{output_root}/{subj_folder}/task_data.npy", task_data) + +print("****************** DONE ******************") +#################################################################################### From 80ab1b0f8024b24c825cbe73a85d681551efa709 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 4 Apr 2024 15:20:58 -0400 Subject: [PATCH 002/401] add task features --- pydfc/task_utils.py | 68 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py index daaf95e..982a2d8 100644 --- a/pydfc/task_utils.py +++ b/pydfc/task_utils.py @@ -224,11 +224,11 @@ def downsample_events_hrf(events_hrf, TR_mri, TR_task, method="uniform"): return events_hrf_ds -def extract_task_presence(event_labels, TR_task, TR_array, TR_mri, binary=True): +def extract_task_presence(event_labels, TR_task, TR_mri, TR_array=None, binary=True): """ event_labels: event labels including 0 and event ids at the time each event happens TR_task: TR of task - TR_array: the time points of the dFC data + TR_array: the time points of the dFC data, optional TR_mri: TR of MRI This function extracts the task presence from the 
event labels and returns it in the same time points as the dFC data @@ -262,6 +262,68 @@ def extract_task_presence(event_labels, TR_task, TR_array, TR_mri, binary=True): task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task) # some dFC measures (window-based) have a different TR than the task data - task_presence = task_presence[TR_array] + if TR_array is not None: + task_presence = task_presence[TR_array] return task_presence + + +################################# Task Features #################################### + + +def relative_task_on(task_presence): + """ + task_presence: 0, 1 array + return: relative_task_on + """ + return np.sum(task_presence) / len(task_presence) + + +def task_duration(task_presence, TR_mri): + """ + task_presence: 0, 1 array + return: avg_task_duration, var_task_duration + """ + task_durations = list() + for i in range(1, len(task_presence)): + if task_presence[i] == 1 and task_presence[i - 1] == 0: + start = i + if task_presence[i] == 0 and task_presence[i - 1] == 1: + end = i + task_durations.append((end - start) * TR_mri) + start = None + task_durations = np.array(task_durations) + return np.mean(task_durations), np.var(task_durations) + + +def rest_duration(task_presence, TR_mri): + """ + task_presence: 0, 1 array + return: avg_rest_duration, var_rest_duration + """ + rest_durations = list() + if task_presence[0] == 0: + start = 0 + for i in range(1, len(task_presence)): + if task_presence[i] == 0 and task_presence[i - 1] == 1: + start = i + if task_presence[i] == 1 and task_presence[i - 1] == 0: + end = i + rest_durations.append((end - start) * TR_mri) + start = None + if task_presence[-1] == 0: + end = len(task_presence) + rest_durations.append((end - start) * TR_mri) + rest_durations = np.array(rest_durations) + return np.mean(rest_durations), np.var(rest_durations) + + +def transition_freq(task_presence): + """ + task_presence: 0, 1 array + return: num_of_transitions, relative_transition_freq + """ + transitions = 
np.abs(np.diff(task_presence)) + num_of_transitions = np.sum(transitions) + relative_transition_freq = num_of_transitions / len(task_presence) + return num_of_transitions, relative_transition_freq From 1300355968abce83ed11fb8f879db021df2884de Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 9 Apr 2024 14:46:02 -0400 Subject: [PATCH 003/401] add KNN_ML --- simul_dFC/KNN_ML.py | 249 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 simul_dFC/KNN_ML.py diff --git a/simul_dFC/KNN_ML.py b/simul_dFC/KNN_ML.py new file mode 100644 index 0000000..bf1a6c9 --- /dev/null +++ b/simul_dFC/KNN_ML.py @@ -0,0 +1,249 @@ +import os + +import numpy as np +from sklearn.decomposition import PCA +from sklearn.metrics import balanced_accuracy_score +from sklearn.model_selection import GridSearchCV, train_test_split +from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler + +from pydfc import DFC, data_loader, task_utils +from pydfc.dfc_utils import dFC_mat2vec, rank_norm + +# Data parameters +dataset = "ds000001" + +# main_root = f"./DATA/{dataset}" # for local +main_root = f"../../DATA/task-based/simulated/{dataset}" # for server +roi_root = f"{main_root}/derivatives/ROI_timeseries" +dFC_root = f"{main_root}/derivatives/dFC_assessed" +output_root = "./ML_RESULTS_KNN_classify" + +TASKS = ["task-pulse"] + +normalize_dFC = True + +SUBJECTS = data_loader.find_subj_list(data_root=roi_root, sessions=TASKS) + +# randomly select 80% of the subjects for training and 20% for testing using numpy.random.choice +train_subjects = np.random.choice(SUBJECTS, int(0.8 * len(SUBJECTS)), replace=False) +test_subjects = np.setdiff1d(SUBJECTS, train_subjects) + +print( + f"number of train_subjects: {len(train_subjects)} and test_subjects: {len(test_subjects)}" +) + + +################## TASK FEATURES ################## + +task_features = { + "task": list(), + 
"relative_task_on": list(), + "avg_task_duration": list(), + "var_task_duration": list(), + "avg_rest_duration": list(), + "var_rest_duration": list(), + "num_of_transitions": list(), + "relative_transition_freq": list(), +} +for task_id, task in enumerate(TASKS): + + if task == "task-restingstate": + continue + + for subj in SUBJECTS: + # event data + task_data = np.load( + f"{roi_root}/{subj}_{task}/task_data.npy", allow_pickle="TRUE" + ).item() + Fs_task = task_data["Fs_task"] + TR_task = 1 / Fs_task + + task_presence = task_utils.extract_task_presence( + event_labels=task_data["event_labels"], + TR_task=TR_task, + TR_mri=task_data["TR_mri"], + binary=True, + ) + + relative_task_on = task_utils.relative_task_on(task_presence) + # task duration + avg_task_duration, var_task_duration = task_utils.task_duration( + task_presence, task_data["TR_mri"] + ) + # rest duration + avg_rest_duration, var_rest_duration = task_utils.rest_duration( + task_presence, task_data["TR_mri"] + ) + # freq of transitions + num_of_transitions, relative_transition_freq = task_utils.transition_freq( + task_presence + ) + + task_features["task"].append(task) + task_features["relative_task_on"].append(relative_task_on) + task_features["avg_task_duration"].append(avg_task_duration) + task_features["var_task_duration"].append(var_task_duration) + task_features["avg_rest_duration"].append(avg_rest_duration) + task_features["var_rest_duration"].append(var_rest_duration) + task_features["num_of_transitions"].append(num_of_transitions) + task_features["relative_transition_freq"].append(relative_transition_freq) + + +################## TASK PRESENCE CLASSIFICATION ################## +ML_scores = { + "subj_id": list(), + "group": list(), + "task": list(), + "dFC method": list(), + "KNN accuracy": list(), +} +for dFC_id in range(0, 7): + print(f"=================== dFC {dFC_id} ===================") + + ML_RESULT = {} + for task_id, task in enumerate(TASKS): + print(f"=============== {task} 
===============") + + if task == "task-restingstate": + continue + + X_train = None + X_test = None + y_condition_train = None + y_condition_test = None + subj_label_train = list() + subj_label_test = list() + + for subj in SUBJECTS: + + dFC = np.load( + f"{dFC_root}/{task}/{subj}/dFC_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + + dFC_mat = dFC.get_dFC_mat() + TR_array = dFC.TR_array + if normalize_dFC: + dFC_mat = rank_norm(dFC_mat) + + dFC_vecs = dFC_mat2vec(dFC_mat) + + # event data + task_data = np.load( + f"{roi_root}/{subj}_{task}/task_data.npy", allow_pickle="TRUE" + ).item() + Fs_task = task_data["Fs_task"] + TR_task = 1 / Fs_task + + task_presence = task_utils.extract_task_presence( + event_labels=task_data["event_labels"], + TR_task=TR_task, + TR_mri=task_data["TR_mri"], + TR_array=TR_array, + binary=True, + ) + + X_new = dFC_vecs + y_new = task_presence.ravel() + + # concat current TR and two TR before of X_new to predict the current TR of y_new + # ignore the edge case of the first two TRs + X_new = np.concatenate( + (X_new, np.roll(X_new, 1, axis=0), np.roll(X_new, 2, axis=0)), axis=1 + ) + X_new = X_new[2:, :] + y_new = y_new[2:] + + if subj in train_subjects: + subj_label_train.extend([subj for i in range(X_new.shape[0])]) + if X_train is None and y_condition_train is None: + X_train = X_new + y_condition_train = y_new + else: + X_train = np.concatenate((X_train, X_new), axis=0) + y_condition_train = np.concatenate((y_condition_train, y_new), axis=0) + elif subj in test_subjects: + subj_label_test.extend([subj for i in range(X_new.shape[0])]) + if X_test is None and y_condition_test is None: + X_test = X_new + y_condition_test = y_new + else: + X_test = np.concatenate((X_test, X_new), axis=0) + y_condition_test = np.concatenate((y_condition_test, y_new), axis=0) + + print( + X_train.shape, X_test.shape, y_condition_train.shape, y_condition_test.shape + ) + subj_label_train = np.array(subj_label_train) + subj_label_test = np.array(subj_label_test) 
+ print(subj_label_train.shape, subj_label_test.shape) + + # task presence classification + + print("task presence classification ...") + + pca = PCA(svd_solver="full", whiten=False) + pca.fit(X_train) + num_PCs = np.where(np.cumsum(pca.explained_variance_ratio_) > 0.95)[0][0] + 1 + + # create new a knn model + knn = KNeighborsClassifier() + # create a dictionary of all values we want to test for n_neighbors + param_grid = {"n_neighbors": np.arange(1, 30)} + # use gridsearch to test all values for n_neighbors + knn_gscv = GridSearchCV(knn, param_grid, cv=5) + # fit model to data + knn_gscv.fit(X_train, y_condition_train) + + n_neighbors = knn_gscv.best_params_["n_neighbors"] + + neigh = make_pipeline( + StandardScaler(), + PCA(n_components=num_PCs), + KNeighborsClassifier(n_neighbors=n_neighbors), + ).fit(X_train, y_condition_train) + + ML_RESULT[task] = { + "pca": pca, + "num_PCs": num_PCs, + "cv_results": knn_gscv.cv_results_, + "KNN": neigh, + "KNN train score": neigh.score(X_train, y_condition_train), + "KNN test score": neigh.score(X_test, y_condition_test), + } + + print( + f"KNN train score {dFC.measure.measure_name} {task}: {neigh.score(X_train, y_condition_train)}" + ) + print( + f"KNN test score {dFC.measure.measure_name} {task}: {neigh.score(X_test, y_condition_test)}" + ) + + # measure pred score on each subj + + for subj in SUBJECTS: + ML_scores["subj_id"].append(subj) + if subj in train_subjects: + ML_scores["group"].append("train") + features = X_train[subj_label_train == subj, :] + target = y_condition_train[subj_label_train == subj] + elif subj in test_subjects: + ML_scores["group"].append("test") + features = X_test[subj_label_test == subj, :] + target = y_condition_test[subj_label_test == subj] + + pred = neigh.predict(features) + + ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred)) + + ML_scores["task"].append(task) + ML_scores["dFC method"].append(dFC.measure.measure_name) + + folder = f"{output_root}" + if not 
os.path.exists(folder): + os.makedirs(folder) + np.save(f"{folder}/ML_RESULT_{dFC.measure.measure_name}.npy", ML_RESULT) + +np.save(f"{folder}/task_features_KNN_classify.npy", task_features) +np.save(f"{folder}/ML_scores_KNN_classify.npy", ML_scores) From eb69b24cbacbc95dc9a5e5020eed43f039792ebe Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 9 Apr 2024 15:01:56 -0400 Subject: [PATCH 004/401] update plot_task_dfc --- pydfc/task_utils.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py index 982a2d8..3fe2870 100644 --- a/pydfc/task_utils.py +++ b/pydfc/task_utils.py @@ -59,11 +59,11 @@ def events_time_to_labels( ################################# Visualization Functions #################################### -def plot_task_dFC(task_labels, dFC_lst, event_types, Fs_mri, TR_step=12): +def plot_task_dFC(task_presence, dFC_lst, Fs_mri, TR_step=12): """ - task_labels: numpy array of shape (num_time_task, num_event_types) containing the event or task labels - this function assumes that the task data has the same Fs as the dFC data, i.e. MRI data - and that the time points of the task data are aligned with the time points of the dFC data + task_presence: numpy array containing the task presence in the time points of the dFC data + this function assumes that the task presence has the same Fs as the dFC data, i.e. 
MRI data + and that the time points of the task presence are aligned with the time points of the dFC data """ conn_mat_size = 20 scale_task_plot = 20 @@ -73,12 +73,8 @@ def plot_task_dFC(task_labels, dFC_lst, event_types, Fs_mri, TR_step=12): ax = plt.gca() - time = np.arange(0, task_labels.shape[0]) / Fs_mri - for i in range(0, task_labels.shape[1]): - ax.plot( - time, task_labels[:, i] * scale_task_plot, label=event_types[i], linewidth=4 - ) - plt.legend() + time = np.arange(0, task_presence.shape[0]) / Fs_mri + ax.plot(time, task_presence * scale_task_plot, linewidth=4) plt.xlabel("Time (s)") comman_TRs = TR_intersection(dFC_lst) From 839928f30e8dcd747a6f28817fc4c7a4bb44d22b Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 10 Apr 2024 13:42:43 -0400 Subject: [PATCH 005/401] add two TRs after for ML --- simul_dFC/KNN_ML.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/simul_dFC/KNN_ML.py b/simul_dFC/KNN_ML.py index bf1a6c9..4de7f76 100644 --- a/simul_dFC/KNN_ML.py +++ b/simul_dFC/KNN_ML.py @@ -147,13 +147,28 @@ X_new = dFC_vecs y_new = task_presence.ravel() - # concat current TR and two TR before of X_new to predict the current TR of y_new - # ignore the edge case of the first two TRs + # # concat current TR and two TR before of X_new to predict the current TR of y_new + # # ignore the edge case of the first two TRs + # X_new = np.concatenate( + # (X_new, np.roll(X_new, 1, axis=0), np.roll(X_new, 2, axis=0)), axis=1 + # ) + # X_new = X_new[2:, :] + # y_new = y_new[2:] + + # concat current TR and two TR before and after of X_new to predict the current TR of y_new + # ignore the edge case of the first and last two TRs X_new = np.concatenate( - (X_new, np.roll(X_new, 1, axis=0), np.roll(X_new, 2, axis=0)), axis=1 + ( + X_new, + np.roll(X_new, 1, axis=0), + np.roll(X_new, 2, axis=0), + np.roll(X_new, -1, axis=0), + np.roll(X_new, -2, axis=0), + ), + axis=1, ) - X_new = X_new[2:, :] - y_new = y_new[2:] + X_new = 
X_new[2:-2, :] + y_new = y_new[2:-2] if subj in train_subjects: subj_label_train.extend([subj for i in range(X_new.shape[0])]) From c8d417bfdb250a51c8c82211ba20a2fe6090a772 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 18 Apr 2024 19:04:57 -0400 Subject: [PATCH 006/401] add dynamic_pred param --- simul_dFC/KNN_ML.py | 47 ++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/simul_dFC/KNN_ML.py b/simul_dFC/KNN_ML.py index 4de7f76..e2848f3 100644 --- a/simul_dFC/KNN_ML.py +++ b/simul_dFC/KNN_ML.py @@ -22,6 +22,7 @@ TASKS = ["task-pulse"] +dynamic_pred = "no" # 'past' or 'past_and_future' or 'no' (only current TR) normalize_dFC = True SUBJECTS = data_loader.find_subj_list(data_root=roi_root, sessions=TASKS) @@ -147,28 +148,30 @@ X_new = dFC_vecs y_new = task_presence.ravel() - # # concat current TR and two TR before of X_new to predict the current TR of y_new - # # ignore the edge case of the first two TRs - # X_new = np.concatenate( - # (X_new, np.roll(X_new, 1, axis=0), np.roll(X_new, 2, axis=0)), axis=1 - # ) - # X_new = X_new[2:, :] - # y_new = y_new[2:] - - # concat current TR and two TR before and after of X_new to predict the current TR of y_new - # ignore the edge case of the first and last two TRs - X_new = np.concatenate( - ( - X_new, - np.roll(X_new, 1, axis=0), - np.roll(X_new, 2, axis=0), - np.roll(X_new, -1, axis=0), - np.roll(X_new, -2, axis=0), - ), - axis=1, - ) - X_new = X_new[2:-2, :] - y_new = y_new[2:-2] + if dynamic_pred == "past": + # concat current TR and two TR before of X_new to predict the current TR of y_new + # ignore the edge case of the first two TRs + X_new = np.concatenate( + (X_new, np.roll(X_new, 1, axis=0), np.roll(X_new, 2, axis=0)), axis=1 + ) + X_new = X_new[2:, :] + y_new = y_new[2:] + + elif dynamic_pred == "past_and_future": + # concat current TR and two TR before and after of X_new to predict the current TR of y_new + # ignore the edge case of the first and last two 
TRs + X_new = np.concatenate( + ( + X_new, + np.roll(X_new, 1, axis=0), + np.roll(X_new, 2, axis=0), + np.roll(X_new, -1, axis=0), + np.roll(X_new, -2, axis=0), + ), + axis=1, + ) + X_new = X_new[2:-2, :] + y_new = y_new[2:-2] if subj in train_subjects: subj_label_train.extend([subj for i in range(X_new.shape[0])]) From 67e176d78737281b5c2c50028fd1bb8f0f913dd2 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 19 Apr 2024 18:08:51 -0400 Subject: [PATCH 007/401] correct make_pipeline KNN_ML --- simul_dFC/KNN_ML.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/simul_dFC/KNN_ML.py b/simul_dFC/KNN_ML.py index e2848f3..44eca0a 100644 --- a/simul_dFC/KNN_ML.py +++ b/simul_dFC/KNN_ML.py @@ -201,20 +201,25 @@ print("task presence classification ...") + # find num_PCs pca = PCA(svd_solver="full", whiten=False) pca.fit(X_train) num_PCs = np.where(np.cumsum(pca.explained_variance_ratio_) > 0.95)[0][0] + 1 - # create new a knn model - knn = KNeighborsClassifier() + # create a pipeline with a knn model to find the best n_neighbors + knn = make_pipeline( + StandardScaler(), + PCA(n_components=num_PCs), + KNeighborsClassifier(), + ) # create a dictionary of all values we want to test for n_neighbors - param_grid = {"n_neighbors": np.arange(1, 30)} + param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)} # use gridsearch to test all values for n_neighbors knn_gscv = GridSearchCV(knn, param_grid, cv=5) # fit model to data knn_gscv.fit(X_train, y_condition_train) - n_neighbors = knn_gscv.best_params_["n_neighbors"] + n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"] neigh = make_pipeline( StandardScaler(), From 472d427ed139fd9bcd5ec0d3561f5dad120adcc1 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 23 Apr 2024 13:08:38 -0400 Subject: [PATCH 008/401] find events columns idx --- pydfc/task_utils.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git 
a/pydfc/task_utils.py b/pydfc/task_utils.py index 3fe2870..a24b3cf 100644 --- a/pydfc/task_utils.py +++ b/pydfc/task_utils.py @@ -6,6 +6,8 @@ @author: Mohammad Torabi """ +import warnings + import matplotlib.pyplot as plt import numpy as np from nilearn import glm @@ -25,15 +27,21 @@ def events_time_to_labels( It assumes that the first time point is TR0 which corresponds to [0 sec, TR sec] interval. oversampling: number of samples per TR_mri to improve the time resolution of tasks """ + + # find which column is the "onset" in the first row + onset_idx = np.where(events[0, :] == "onset")[0][0] + duration_idx = np.where(events[0, :] == "duration")[0][0] + trial_type_idx = np.where(events[0, :] == "trial_type")[0][0] + assert ( - events[0, 0] == "onset" - ), "The first column of the events file should be the onset!" + events[0, onset_idx] == "onset" + ), "Something went wrong with the events file! The onset column was not found!" assert ( - events[0, 1] == "duration" - ), "The second column of the events file should be the duration!" + events[0, duration_idx] == "duration" + ), "Something went wrong with the events file! The duration column was not found!" assert ( - events[0, 2] == "trial_type" - ), "The third column of the events file should be the trial type!" + events[0, trial_type_idx] == "trial_type" + ), "Something went wrong with the events file! The trial_type column was not found!" 
Fs = float(1 / TR_mri) * oversampling num_time_task = int(num_time_mri * oversampling) @@ -43,12 +51,16 @@ def events_time_to_labels( if i == 0: continue - if events[i, 2] in event_types: - start_time = float(events[i, 0]) - end_time = float(events[i, 0]) + float(events[i, 1]) + if events[i, trial_type_idx] in event_types: + if events[i, trial_type_idx] == "rest": + warnings.warn("trial types should not include 'rest'") + start_time = float(events[i, onset_idx]) + end_time = float(events[i, onset_idx]) + float(events[i, duration_idx]) start_timepoint = int(np.rint(start_time * Fs)) end_timepoint = int(np.rint(end_time * Fs)) - event_labels[start_timepoint:end_timepoint] = event_types.index(events[i, 2]) + event_labels[start_timepoint:end_timepoint] = event_types.index( + events[i, trial_type_idx] + ) if return_0_1: event_labels = np.multiply(event_labels != 0, 1) From daa3e6bf89cc4cac0c3f8cd7b16cbbc250746b51 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 26 Apr 2024 15:40:30 -0400 Subject: [PATCH 009/401] add simul_utils --- pydfc/__init__.py | 2 + pydfc/simul_utils.py | 331 +++++++++++++++++++++++++++++++ simul_dFC/task_data_simulator.py | 231 ++++++++++----------- task_dFC/nifti_to_roi_signal.py | 304 +++++++++++++++++----------- 4 files changed, 625 insertions(+), 243 deletions(-) create mode 100644 pydfc/simul_utils.py diff --git a/pydfc/__init__.py b/pydfc/__init__.py index d5ac722..c793222 100644 --- a/pydfc/__init__.py +++ b/pydfc/__init__.py @@ -27,4 +27,6 @@ "dfc_methods", "dfc_utils", "comparison", + "task_utils", + "simul_utils", ] diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py new file mode 100644 index 0000000..109aac2 --- /dev/null +++ b/pydfc/simul_utils.py @@ -0,0 +1,331 @@ +# -*- coding: utf-8 -*- +""" +Functions to facilitate dFC simulation. 
def create_random_stimulus_weights(stimulated_regions_list, n_regions=76):
    """
    Create random stimulus weights for the stimulated regions.

    The i-th entry of ``stimulated_regions_list`` receives a weight drawn from
    a normal distribution with mean ``2**-(2 + i)`` and a standard deviation
    equal to 10% of that mean. All other regions get a weight of 0.

    Parameters
    ----------
    stimulated_regions_list : list of int
        Indices of the regions that receive the stimulus.
    n_regions : int, optional
        Total number of regions in the connectivity. The default is 76.

    Returns
    -------
    weighting : numpy.ndarray
        Array of shape (n_regions,) holding the stimulus weight of each region.
    """
    # configure stimulus spatial pattern
    weighting = np.zeros((n_regions,))
    for i, region in enumerate(stimulated_regions_list):
        mean_weight = 2.0 ** (-1 * (2 + i))
        # the spread scales with the mean of each region; a fixed spread of
        # 0.1 * 2**-2 would be larger than the mean weight of the later regions
        weighting[region] = np.random.normal(loc=mean_weight, scale=0.1 * mean_weight)

    return weighting


def create_stimulus(
    onset,
    task_duration,
    task_block_duration,
    conn,
    region_weighting,
):
    """
    Create a stimulus pattern for the task.

    Parameters
    ----------
    onset : float
        Onset of the first task block, in seconds.
    task_duration : float
        Duration of the stimulation within each block, in seconds.
    task_block_duration : float
        Period of the pulse train (task + rest), in seconds.
    conn : connectivity.Connectivity
        The connectivity the stimulus is applied to.
    region_weighting : numpy.ndarray
        Stimulus weight of each region.

    Returns
    -------
    stimulus : patterns.StimuliRegion
        The region-level stimulus with a pulse-train temporal profile.
    """
    # temporal profile; the inputs are in seconds and are converted to ms here
    eqn_t = equations.PulseTrain()
    eqn_t.parameters["onset"] = onset * 1e3  # ms
    eqn_t.parameters["tau"] = task_duration * 1e3  # ms
    eqn_t.parameters["T"] = task_block_duration * 1e3  # ms

    stimulus = patterns.StimuliRegion(
        temporal=eqn_t, connectivity=conn, weight=region_weighting
    )

    return stimulus


def simulate_task_BOLD(
    onset_time,
    task_duration,
    task_block_duration,
    sim_length,
    BOLD_period,
    TAVG_period,
    global_conn_coupling_coef=0.0126,
    D=0.001,
    conn_speed=1.0,
    dt=0.5,
    drop_initial_time=False,
    randomize_onset=True,
):
    """
    Simulate BOLD signal for a task.

    Parameters
    ----------
    onset_time : float
        Onset of the first task block, in seconds.
    task_duration : float
        Duration of the stimulation within each block, in seconds.
    task_block_duration : float
        Period of the task blocks (task + rest), in seconds.
    sim_length : float
        Length of the simulation, passed to the simulator as
        ``simulation_length`` (the calling script provides it in ms).
    BOLD_period : float
        Sampling period of the BOLD monitor, in ms.
    TAVG_period : float
        Sampling period of the temporal average monitor, in ms.
    global_conn_coupling_coef : float, optional
        Mean of the randomized global coupling strength.
    D : float, optional
        Noise dispersion of the stochastic integrator.
    conn_speed : float, optional
        Mean of the randomized conduction speed.
    dt : float, optional
        Integration step.
    drop_initial_time : bool, optional
        If True, the first 10 seconds of both monitors are dropped to avoid
        transient effects.
    randomize_onset : bool, optional
        If True (default), the onset is jittered around ``onset_time`` with a
        standard deviation of 0.5 s. Pass False when the caller has already
        drawn the onset and needs to know the exact value that was used.

    Returns
    -------
    tuple
        (bold_data, bold_time, region_labels, centres_locs, TR_mri,
        TAVG_data, tavg_time, TAVG_period), where bold_data and TAVG_data
        have shape (roi, time) and TR_mri is in seconds.
    """
    # randomize some parameters for each subjects
    if randomize_onset:
        onset = np.random.normal(loc=onset_time, scale=0.5)  # seconds
    else:
        onset = onset_time
    global_conn_coupling = np.random.normal(loc=global_conn_coupling_coef, scale=0.0075)
    conn_speed_rand = np.random.normal(loc=conn_speed, scale=0.1 * conn_speed)
    ################################# Initialize Simulation ####################################
    conn = connectivity.Connectivity.from_file()
    conn.speed = np.array([conn_speed_rand])

    # configure stimulus spatial pattern
    weighting = create_random_stimulus_weights(
        stimulated_regions_list=[0, 7, 13, 33, 42], n_regions=76
    )

    stimulus = create_stimulus(
        onset=onset,
        task_duration=task_duration,
        task_block_duration=task_block_duration,
        conn=conn,
        region_weighting=weighting,
    )

    ################################# Run Simulation ####################################

    # set the global coupling strength
    # you can switch between deterministic (without noise) and stochastic integration (with noise)
    sim = simulator.Simulator(
        model=models.Generic2dOscillator(a=np.array([0.5])),
        connectivity=conn,
        coupling=coupling.Linear(a=np.array([global_conn_coupling])),
        # integrator=integrators.HeunDeterministic(dt=dt),
        integrator=integrators.HeunStochastic(
            dt=dt, noise=noise.Additive(nsig=np.array([D]))
        ),
        monitors=(
            monitors.TemporalAverage(period=TAVG_period),
            monitors.Bold(period=BOLD_period, hrf_kernel=equations.MixtureOfGammas()),
            monitors.ProgressLogger(period=10e3),
        ),
        stimulus=stimulus,
        simulation_length=sim_length,
    ).configure()

    (tavg_time, tavg_data), (bold_time, bold_data), _ = sim.run()

    if drop_initial_time:
        # truncate the first 10 seconds of the simulation
        # to avoid transient effects
        truncate_time = 10e3  # in m sec
        bold_truncate_idx = int(truncate_time / BOLD_period)
        bold_time = bold_time[bold_truncate_idx:]
        bold_data = bold_data[bold_truncate_idx:]
        tavg_truncate_idx = int(truncate_time / TAVG_period)
        tavg_time = tavg_time[tavg_truncate_idx:]
        tavg_data = tavg_data[tavg_truncate_idx:]

    centres_locs = conn.centres
    region_labels = list(conn.region_labels)
    TR_mri = BOLD_period * 1e-3  # in seconds

    bold_data = bold_data[:, 0, :, 0]
    # change time_series.shape to (roi, time)
    bold_data = bold_data.T

    TAVG_data = tavg_data[:, 0, :, 0]
    # change time_series.shape to (roi, time)
    TAVG_data = TAVG_data.T

    return (
        bold_data,
        bold_time,
        region_labels,
        centres_locs,
        TR_mri,
        TAVG_data,
        tavg_time,
        TAVG_period,
    )


def create_simul_task_info(
    num_time_mri,
    TR_mri,
    task,
    onset,
    task_duration,
    task_block_duration,
    sim_length,
    oversampling=50,
):
    """
    Create a dictionary containing the task data for simulation.

    Parameters
    ----------
    num_time_mri : int
        Number of time points in the BOLD signal.
    TR_mri : float
        The repetition time of the MRI.
    task : str
        The task name.
    onset : float
        The onset time of the task.
    task_duration : float
        The duration of the task.
    task_block_duration : float
        The duration of the task block.
    sim_length : float
        The length of the simulation.
    oversampling : int, optional
        The oversampling factor. The default is 50.
        generate more samples per TR than the func data to have a
        better event_labels time resolution

    Returns
    -------
    task_data : dict
        Dictionary with the keys "task", "task_labels", "event_labels",
        "event_types", "events", "Fs_task", "TR_mri" and "num_time_mri".
    """
    ################################# EXTRACT TASK LABELS #########################
    events = []
    event_types = ["rest", "task"]

    # using onset, task_duration, task_block_duration to create the events
    events.append(["onset", "duration", "trial_type"])
    t = onset
    # NOTE(review): t is in seconds while the calling script passes sim_length
    # in ms, so this loop presumably generates many events past the end of the
    # scan -- confirm the intended unit of sim_length here.
    while t < sim_length:
        events.append([t, task_duration, "task"])
        t += task_block_duration
    events = np.array(events)

    event_labels, Fs_task = task_utils.events_time_to_labels(
        events=events,
        TR_mri=TR_mri,
        num_time_mri=num_time_mri,
        event_types=event_types,
        oversampling=oversampling,
        return_0_1=False,
    )
    # fill task labels with 0 (rest) and 1 (task's index, here only 1 task is used)
    task_labels = np.multiply(event_labels != 0, 1)
    ################################# SAVE #################################
    # collect the task data
    task_data = {
        "task": task,
        "task_labels": task_labels,
        "event_labels": event_labels,
        "event_types": event_types,
        "events": events,
        "Fs_task": Fs_task,
        "TR_mri": TR_mri,
        "num_time_mri": num_time_mri,
    }

    return task_data


def simulate_task_BOLD_TS(
    subj_id,
    task,
    onset_time,
    task_duration,
    task_block_duration,
    sim_length,
    BOLD_period,
    TAVG_period,
    global_conn_coupling_coef=0.0126,
    D=0.001,
    conn_speed=1.0,
    dt=0.5,
    drop_initial_time=False,
):
    """
    Simulate BOLD signal for a task and return a TIME_SERIES object.

    The onset is jittered once here (normal distribution around
    ``onset_time`` with a standard deviation of 0.5 s) and the same value is
    used for both the stimulus and the task labels, so that the labels are
    aligned with the stimulus that was actually simulated.

    Parameters
    ----------
    subj_id : str
        The subject ID.
    task : str
        The task name, also used as the session name of the time series.

    The remaining parameters are passed to `simulate_task_BOLD`.

    Returns
    -------
    time_series : TIME_SERIES
        The simulated BOLD signal.
    task_data : dict
        The task data created by `create_simul_task_info`.
    """
    # draw the subject-specific onset here so that it is known to both the
    # simulation and the task labels
    onset = np.random.normal(loc=onset_time, scale=0.5)  # seconds

    bold_data, bold_time, region_labels, centres_locs, TR_mri, _, _, _ = (
        simulate_task_BOLD(
            onset_time=onset,
            task_duration=task_duration,
            task_block_duration=task_block_duration,
            sim_length=sim_length,
            BOLD_period=BOLD_period,
            TAVG_period=TAVG_period,
            global_conn_coupling_coef=global_conn_coupling_coef,
            D=D,
            conn_speed=conn_speed,
            dt=dt,
            drop_initial_time=drop_initial_time,
            randomize_onset=False,
        )
    )
    time_series = TIME_SERIES(
        data=bold_data,
        subj_id=subj_id,
        Fs=1 / TR_mri,
        locs=centres_locs,
        node_labels=region_labels,
        TS_name=f"BOLD_{subj_id}_{task}",
        session_name=task,
    )
    num_time_mri = time_series.n_time
    # NOTE(review): when drop_initial_time is True the BOLD signal is truncated
    # by 10 s but the onset below is not shifted accordingly -- confirm whether
    # the labels should be shifted as well.
    task_data = create_simul_task_info(
        num_time_mri=num_time_mri,
        TR_mri=TR_mri,
        task=task,
        onset=onset,
        task_duration=task_duration,
        task_block_duration=task_block_duration,
        sim_length=sim_length,
    )

    return time_series, task_data


def simulate_task(subj_id, task_info):
    """
    Simulate task-based BOLD signal for a subject.

    Parameters
    ----------
    subj_id : int
        The subject ID.
    task_info : dict
        A dictionary containing the task information below:
        - task_name: str
            The name of the task.
        - onset_time: float
            The onset time of the task.
        - task_duration: float
            The duration of the task.
        - task_block_duration: float
            The duration of the task block.
        - sim_length: float
            The length of the simulation.
        - BOLD_period: float
            The BOLD period.
        - TAVG_period: float
            The TAVG period.
        - global_conn_coupling_coef: float
            The global connectivity coupling coefficient.
        - D: float
            The noise parameter.
        - conn_speed: float
            The connectivity speed.
        - dt: float
            The simulation time step.

    Returns
    -------
    time_series : TIME_SERIES
        The simulated BOLD signal.
    task_data : dict
        The task data created by `create_simul_task_info`.
    """
    # delegate to simulate_task_BOLD_TS; calling simulate_task itself here
    # fails because it does not accept these keyword arguments
    time_series, task_data = simulate_task_BOLD_TS(
        subj_id=subj_id,
        task=task_info["task_name"],
        onset_time=task_info["onset_time"],
        task_duration=task_info["task_duration"],
        task_block_duration=task_info["task_block_duration"],
        sim_length=task_info["sim_length"],
        BOLD_period=task_info["BOLD_period"],
        TAVG_period=task_info["TAVG_period"],
        global_conn_coupling_coef=task_info["global_conn_coupling_coef"],
        D=task_info["D"],
        conn_speed=task_info["conn_speed"],
        dt=task_info["dt"],
    )

    return time_series, task_data
for subject: {subj_id} ...") -# randomize some parameters for each subjects -onset = np.random.normal(loc=onset_time, scale=0.5) # seconds -global_conn_coupling = np.random.normal(loc=0.0126, scale=0.0075) -rand_weighting = np.array( - [ - np.random.normal(loc=2.0**-2, scale=0.1 * (2.0**-2)), - np.random.normal(loc=2.0**-3, scale=0.1 * (2.0**-3)), - np.random.normal(loc=2.0**-4, scale=0.1 * (2.0**-4)), - np.random.normal(loc=2.0**-5, scale=0.1 * (2.0**-5)), - np.random.normal(loc=2.0**-6, scale=0.1 * (2.0**-6)), - ] -) -conn_speed_rand = np.random.normal(loc=conn_speed, scale=0.1 * conn_speed) -################################# Initialize Simulation #################################### -conn = connectivity.Connectivity.from_file() -conn.speed = np.array([conn_speed_rand]) - -# configure stimulus spatial pattern -weighting = np.zeros((76,)) -weighting[[0, 7, 13, 33, 42]] = rand_weighting -# weighting[[0, 7, 13, 33, 42]] = numpy.array([2.0 ** -2, 2.0 ** -3, 2.0 ** -4, 2.0 ** -5, 2.0 ** -6]) - -# temporal profile -eqn_t = equations.PulseTrain() -eqn_t.parameters["onset"] = onset * 1e3 # ms -eqn_t.parameters["tau"] = task_duration * 1e3 # ms -eqn_t.parameters["T"] = task_block_duration * 1e3 # ms - -stimulus = patterns.StimuliRegion(temporal=eqn_t, connectivity=conn, weight=weighting) - -################################# Run Simulation #################################### - -# set the global coupling strength -# you can switch between deterministic (without noise) and stochastic integration (with noise) -sim = simulator.Simulator( - model=models.Generic2dOscillator(a=np.array([0.5])), - connectivity=conn, - coupling=coupling.Linear(a=np.array([global_conn_coupling])), - # integrator=integrators.HeunDeterministic(dt=dt), - integrator=integrators.HeunStochastic( - dt=dt, noise=noise.Additive(nsig=np.array([D])) - ), - monitors=( - monitors.TemporalAverage(period=TAVG_period), - monitors.Bold(period=BOLD_period, hrf_kernel=equations.MixtureOfGammas()), - 
monitors.ProgressLogger(period=10e3), - ), - stimulus=stimulus, - simulation_length=sim_length, -).configure() - -(tavg_time, tavg_data), (bold_time, bold_data), _ = sim.run() - -# # truncate the first 10 seconds of the simulation -# # to avoid transient effects -# truncate_time = 10e3 # in m sec -# bold_truncate_idx = int(truncate_time / BOLD_period) -# bold_time = bold_time[bold_truncate_idx:] -# bold_data = bold_data[bold_truncate_idx:] -# tavg_truncate_idx = int(truncate_time / TAVG_period) -# tavg_time = tavg_time[tavg_truncate_idx:] -# tavg_data = tavg_data[tavg_truncate_idx:] - -centres_locs = conn.centres -region_labels = list(conn.region_labels) -TR_mri = BOLD_period * 1e-3 # in seconds - -bold_data = bold_data[:, 0, :, 0] -# change time_series.shape to (roi, time) -bold_data = bold_data.T - -time_series = TIME_SERIES( - data=bold_data, - subj_id=subj_id, - Fs=1 / TR_mri, - locs=centres_locs, - node_labels=region_labels, - TS_name=f"BOLD_{subj_id}_{task}", - session_name=task, -) -num_time_mri = time_series.n_time -################################# EXTRACT TASK LABELS ######################### -oversampling = 50 # more samples per TR than the func data to have a better event_labels time resolution +all_task_info = { + "task-midFreqMidRest": { + "task_name": "task-midFreqMidRest", + "onset_time": onset_time, + "task_duration": 12.0, + "task_block_duration": 30.0, + "sim_length": sim_length, + "BOLD_period": BOLD_period, + "TAVG_period": TAVG_period, + "global_conn_coupling_coef": global_conn_coupling_coef, + "D": D, + "conn_speed": conn_speed, + "dt": dt, + }, + "task-lowFreqLongRest": { + "task_name": "task-lowFreqLongRest", + "onset_time": onset_time, + "task_duration": 20.0, + "task_block_duration": 40.0, + "sim_length": sim_length, + "BOLD_period": BOLD_period, + "TAVG_period": TAVG_period, + "global_conn_coupling_coef": global_conn_coupling_coef, + "D": D, + "conn_speed": conn_speed, + "dt": dt, + }, + "task-lowFreqShortRest": { + "task_name": 
"task-lowFreqShortRest", + "onset_time": onset_time, + "task_duration": 20.0, + "task_block_duration": 25.0, + "sim_length": sim_length, + "BOLD_period": BOLD_period, + "TAVG_period": TAVG_period, + "global_conn_coupling_coef": global_conn_coupling_coef, + "D": D, + "conn_speed": conn_speed, + "dt": dt, + }, + "task-lowFreqShortTask": { + "task_name": "task-lowFreqShortTask", + "onset_time": onset_time, + "task_duration": 5.0, + "task_block_duration": 30.0, + "sim_length": sim_length, + "BOLD_period": BOLD_period, + "TAVG_period": TAVG_period, + "global_conn_coupling_coef": global_conn_coupling_coef, + "D": D, + "conn_speed": conn_speed, + "dt": dt, + }, + "task-highFreqLongRest": { + "task_name": "task-highFreqLongRest", + "onset_time": onset_time, + "task_duration": 1.0, + "task_block_duration": 5.0, + "sim_length": sim_length, + "BOLD_period": BOLD_period, + "TAVG_period": TAVG_period, + "global_conn_coupling_coef": global_conn_coupling_coef, + "D": D, + "conn_speed": conn_speed, + "dt": dt, + }, + "task-highFreqShortRest": { + "task_name": "task-highFreqShortRest", + "onset_time": onset_time, + "task_duration": 4.0, + "task_block_duration": 5.0, + "sim_length": sim_length, + "BOLD_period": BOLD_period, + "TAVG_period": TAVG_period, + "global_conn_coupling_coef": global_conn_coupling_coef, + "D": D, + "conn_speed": conn_speed, + "dt": dt, + }, + "task-midFreqMidRestNoisy": { + "task_name": "task-midFreqMidRestNoisy", + "onset_time": onset_time, + "task_duration": 12.0, + "task_block_duration": 30.0, + "sim_length": sim_length, + "BOLD_period": BOLD_period, + "TAVG_period": TAVG_period, + "global_conn_coupling_coef": global_conn_coupling_coef, + "D": D * 100, + "conn_speed": conn_speed, + "dt": dt, + }, +} -events = [] -event_types = ["rest", "task"] -TASKS = [task] +for task in all_task_info: -# using onset, task_duration, task_block_duration to create the events -events.append(["onset", "duration", "trial_type"]) -t = onset -while t < sim_length: - 
events.append([t, task_duration, "task"]) - t += task_block_duration -events = np.array(events) + time_series, task_data = simul_utils.simulate_task(subj_id, all_task_info[task]) -event_labels, Fs_task = task_utils.events_time_to_labels( - events=events, - TR_mri=TR_mri, - num_time_mri=num_time_mri, - event_types=event_types, - oversampling=oversampling, - return_0_1=False, -) -# fill task labels with 0 (rest) and 1 (task's index, here only 1 task is used) -task_labels = np.multiply(event_labels != 0, 1) -################################# SAVE ################################# -# save the ROI time series and task data -task_data = { - "task": task, - "task_labels": task_labels, - "task_types": TASKS, - "event_labels": event_labels, - "event_types": event_types, - "events": events, - "Fs_task": Fs_task, - "TR_mri": TR_mri, - "num_time_mri": num_time_mri, -} -subj_folder = f"{subj_id}_{task}" -if not os.path.exists(f"{output_root}/{subj_folder}/"): - os.makedirs(f"{output_root}/{subj_folder}/") -np.save(f"{output_root}/{subj_folder}/time_series.npy", time_series) -np.save(f"{output_root}/{subj_folder}/task_data.npy", task_data) + # save the time series and task data + subj_folder = f"{subj_id}_{task}" + if not os.path.exists(f"{output_root}/{subj_folder}/"): + os.makedirs(f"{output_root}/{subj_folder}/") + np.save(f"{output_root}/{subj_folder}/time_series.npy", time_series) + np.save(f"{output_root}/{subj_folder}/task_data.npy", task_data) print("****************** DONE ******************") #################################################################################### diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index 1e52cb8..0e20700 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -1,3 +1,4 @@ +import argparse import json import os import warnings @@ -6,130 +7,201 @@ from pydfc import data_loader, task_utils -warnings.simplefilter("ignore") - -################################# Parameters 
################################# FUNCTIONS #################################
def run_roi_signal_extraction(
    subj, task, main_root, fmriprep_root, bold_suffix, output_root, tasks=None
):
    """
    Extract ROI signals and task labels for a given subject and task

    For every run of the task, the ROI time series is saved to
    ``{output_root}/{subj}/{prefix}_time-series.npy`` and the task data to
    ``{output_root}/{subj}/{prefix}_task-data.npy``, where the prefix is
    ``{subj}_{task}_{run}`` for multi-run tasks and ``{subj}_{task}`` otherwise.

    Parameters
    ----------
    subj : str
        The subject ID, as used in the folder and file names.
    task : str
        The task name, as used in the file names.
    main_root : str
        Root of the dataset; raw files are read from ``{main_root}/bids``.
    fmriprep_root : str
        Root of the fmriprep derivatives.
    bold_suffix : str
        Suffix that identifies the preprocessed BOLD files.
    output_root : str
        Folder in which the outputs are saved.
    tasks : list of str, optional
        All tasks of the dataset; the index of ``task`` in this list is used
        as its task label. If None, the module-level ``TASKS`` set by the
        script entry point is used.

    Raises
    ------
    ValueError
        If the events file has no "trial_type" column or if one of its trial
        types is named "rest".
    """
    if tasks is None:
        # backward compatible with the script usage, where TASKS is a global
        tasks = TASKS

    # find the func file for this subject and task
    ALL_TASK_FILES = [
        file_i
        for file_i in os.listdir(f"{fmriprep_root}/{subj}/func/")
        if (bold_suffix in file_i) and (task in file_i)
    ]  # only keep the denoised files? or use the original files?

    if not ALL_TASK_FILES:
        # if the func file is not found, exclude the subject
        print("Func file not found for " + subj + " " + task)
        return

    # there might be multiple runs for the same task
    # check if "_run" exists in all the task file names
    multi_run_flag = all("_run" in task_file for task_file in ALL_TASK_FILES)
    if multi_run_flag:
        # find all the runs, e.g. "run-1" from "..._run-1_..."
        RUNS = sorted(
            {
                task_file[task_file.find("_run") + 1 :].split("_", 1)[0]
                for task_file in ALL_TASK_FILES
            }
        )
        print(f"Found multiple runs for {subj} {task}: {RUNS}")
    else:
        RUNS = [""]

    for run in RUNS:
        # match the run label with its delimiters, otherwise "run-1" would
        # also match the files of "run-10"
        run_token = f"_{run}_" if run else ""
        task_file = [file_i for file_i in ALL_TASK_FILES if run_token in file_i][0]
        nifti_file = f"{fmriprep_root}/{subj}/func/{task_file}"
        info_file = (
            f"{main_root}/bids/{subj}/func/{task_file.replace(bold_suffix, '_bold.json')}"
        )

        ################################# LOAD JSON INFO #########################
        # Opening JSON file as a dictionary
        with open(info_file) as f:
            acquisition_data = json.load(f)
        TR_mri = acquisition_data["RepetitionTime"]
        ################################# EXTRACT TIME SERIES #########################
        # extract ROI signals and convert to TIME_SERIES object
        time_series = data_loader.nifti2timeseries(
            nifti_file=nifti_file,
            n_rois=100,
            Fs=1 / TR_mri,
            subj_id=subj,
            confound_strategy="no_motion",
            standardize="zscore",
            TS_name="BOLD",
            session=task,
        )
        num_time_mri = time_series.n_time
        ################################# EXTRACT TASK LABELS #########################
        oversampling = 50  # more samples per TR than the func data to have a better event_labels time resolution
        if task == "task-restingstate":
            events = []
            event_types = ["rest"]
            event_labels = np.zeros((int(num_time_mri * oversampling), 1))
            task_labels = np.zeros((int(num_time_mri * oversampling), 1))
            Fs_task = float(1 / TR_mri) * oversampling
        else:
            task_events_root = f"{main_root}/bids/{subj}/func"
            ALL_EVENTS_FILES = [
                file_i
                for file_i in os.listdir(task_events_root)
                if (subj in file_i)
                and (task in file_i)
                and (run_token in file_i)
                and ("events.tsv" in file_i)
            ]
            if not len(ALL_EVENTS_FILES) == 1:
                # if the events file is not found, exclude the subject
                print(f"Events file not found for {subj} {task} {run}")
                return
            # load the tsv events file
            events_file = f"{task_events_root}/{ALL_EVENTS_FILES[0]}"
            events = np.genfromtxt(events_file, delimiter="\t", dtype=str)
            # find the trial_type column from the header instead of assuming
            # that it is the third column
            header = list(events[0, :])
            if "trial_type" not in header:
                raise ValueError(
                    f"The trial_type column was not found in {events_file}"
                )
            trial_types = list(np.unique(events[1:, header.index("trial_type")]))
            # get the event labels
            # check that "rest" does not already exist in the event types,
            # because index 0 of event_types is reserved for it
            if "rest" in trial_types:
                raise ValueError("Event types should not include 'rest'")
            event_types = ["rest"] + trial_types
            event_labels, Fs_task = task_utils.events_time_to_labels(
                events=events,
                TR_mri=TR_mri,
                num_time_mri=num_time_mri,
                event_types=event_types,
                oversampling=oversampling,
                return_0_1=False,
            )
            # fill task labels with task's index
            task_labels = np.ones((int(num_time_mri * oversampling), 1)) * tasks.index(
                task
            )
        ################################# SAVE #################################
        # save the ROI time series and task data
        task_data = {
            "task": task,
            "task_labels": task_labels,
            "task_types": tasks,
            "event_labels": event_labels,
            "event_types": event_types,
            "events": events,
            "Fs_task": Fs_task,
            "TR_mri": TR_mri,
            "num_time_mri": num_time_mri,
        }
        if multi_run_flag:
            output_file_prefix = f"{subj}_{task}_{run}"
        else:
            output_file_prefix = f"{subj}_{task}"
        os.makedirs(f"{output_root}/{subj}/", exist_ok=True)
        np.save(f"{output_root}/{subj}/{output_file_prefix}_time-series.npy", time_series)
        np.save(f"{output_root}/{subj}/{output_file_prefix}_task-data.npy", task_data)


########################################################################################

if __name__ == "__main__":
    # argparse
    HELPTEXT = """
    Script to convert nifti files to ROI signals for a given participant.
    """

    parser = argparse.ArgumentParser(description=HELPTEXT)

    parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
    parser.add_argument("--participant_id", type=str, help="participant id")

    args = parser.parse_args()

    dataset_info_file = args.dataset_info
    participant_id = args.participant_id

    # Read global configs
    with open(dataset_info_file, "r") as f:
        dataset_info = json.load(f)

    print(
        f"subject-level ROI signal extraction CODE started running ... for subject: {participant_id} ..."
    )

    TASKS = dataset_info["TASKS"]

    # resolve the "{dataset}" and "{main_root}" placeholders of the paths
    if "{dataset}" in dataset_info["main_root"]:
        main_root = dataset_info["main_root"].replace(
            "{dataset}", dataset_info["dataset"]
        )
    else:
        main_root = dataset_info["main_root"]

    if "{main_root}" in dataset_info["fmriprep_root"]:
        fmriprep_root = dataset_info["fmriprep_root"].replace("{main_root}", main_root)
    else:
        fmriprep_root = dataset_info["fmriprep_root"]

    if "{main_root}" in dataset_info["output_root"]:
        output_root = dataset_info["output_root"].replace("{main_root}", main_root)
    else:
        output_root = dataset_info["output_root"]

    for task in TASKS:
        run_roi_signal_extraction(
            subj=participant_id,
            task=task,
            main_root=main_root,
            fmriprep_root=fmriprep_root,
            bold_suffix=dataset_info["bold_suffix"],
            output_root=output_root,
            tasks=TASKS,
        )

    print(
        f"subject-level ROI signal extraction CODE finished running ... for subject: {participant_id} ..."
    )

####################################################################
-) -################################# FIND THE FUNC FILE ################################# -for task in TASKS: +# warnings.simplefilter("ignore") + + +################################# FUNCTIONS ################################# +def run_roi_signal_extraction( + subj, task, main_root, fmriprep_root, bold_suffix, output_root +): + """ + Extract ROI signals and task labels for a given subject and task + """ # find the func file for this subject and task ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/") ALL_TASK_FILES = [ - i for i in ALL_TASK_FILES if (bold_suffix in i) and (task in i) + file_i + for file_i in ALL_TASK_FILES + if (bold_suffix in file_i) and (task in file_i) ] # only keep the denoised files? or use the original files? - # print(ALL_TASK_FILES) - if not len(ALL_TASK_FILES) == 1: + + if not len(ALL_TASK_FILES) >= 1: # if the func file is not found, exclude the subject print("Func file not found for " + subj + " " + task) - continue - fmriprep_file = f"{fmriprep_root}/{subj}/func/{ALL_TASK_FILES[0]}" - info_file = ( - f"{main_root}/{subj}/func/{ALL_TASK_FILES[0].replace(bold_suffix, '_bold.json')}" - ) + return + + # there might be multiple runs for the same task + # check if "_run" exists in all the task file names + if all(["_run" in task_file for task_file in ALL_TASK_FILES]): + multi_run_flag = True + # find all the runs + RUNS = [ + task_file[ + task_file.find("_run") + + 1 : task_file.find("_run") + + 1 + + task_file[task_file.find("_run") + 1 :].find("_") + ] + for task_file in ALL_TASK_FILES + ] + # sort + RUNS.sort() + print(f"Found multiple runs for {subj} {task}: {RUNS}") + else: + multi_run_flag = False + RUNS = [""] + + for run in RUNS: + task_file = [file_i for file_i in ALL_TASK_FILES if run in file_i][0] + nifti_file = f"{fmriprep_root}/{subj}/func/{task_file}" + info_file = ( + f"{main_root}/bids/{subj}/func/{task_file.replace(bold_suffix, '_bold.json')}" + ) + + ################################# LOAD JSON INFO 
######################### + # Opening JSON file as a dictionary + f = open(info_file) + acquisition_data = json.load(f) + f.close() + TR_mri = acquisition_data["RepetitionTime"] + ################################# EXTRACT TIME SERIES ######################### + # extract ROI signals and convert to TIME_SERIES object + time_series = data_loader.nifti2timeseries( + nifti_file=nifti_file, + n_rois=100, + Fs=1 / TR_mri, + subj_id=subj, + confound_strategy="no_motion", + standardize="zscore", + TS_name="BOLD", + session=task, + ) + num_time_mri = time_series.n_time + ################################# EXTRACT TASK LABELS ######################### + oversampling = 50 # more samples per TR than the func data to have a better event_labels time resolution + if task == "task-restingstate": + events = [] + event_types = ["rest"] + event_labels = np.zeros((int(num_time_mri * oversampling), 1)) + task_labels = np.zeros((int(num_time_mri * oversampling), 1)) + Fs_task = float(1 / TR_mri) * oversampling + else: + task_events_root = f"{main_root}/bids/{subj}/func" + ALL_EVENTS_FILES = os.listdir(task_events_root) + ALL_EVENTS_FILES = [ + file_i + for file_i in ALL_EVENTS_FILES + if (subj in file_i) + and (task in file_i) + and (run in file_i) + and ("events.tsv" in file_i) + ] + if not len(ALL_EVENTS_FILES) == 1: + # if the events file is not found, exclude the subject + print(f"Events file not found for {subj} {task} {run}") + return + # load the tsv events file + events_file = f"{task_events_root}/{ALL_EVENTS_FILES[0]}" + events = np.genfromtxt(events_file, delimiter="\t", dtype=str) + # get the event labels + # check that "rest" does not already exist in the event types + if any( + ["rest" in event_type for event_type in list(np.unique(events[1:, 2]))] + ): + raise ValueError("Event types should not include 'rest'") + event_types = ["rest"] + list(np.unique(events[1:, 2])) + event_labels, Fs_task = task_utils.events_time_to_labels( + events=events, + TR_mri=TR_mri, + 
num_time_mri=num_time_mri, + event_types=event_types, + oversampling=oversampling, + return_0_1=False, + ) + # fill task labels with task's index + task_labels = np.ones((int(num_time_mri * oversampling), 1)) * TASKS.index( + task + ) + ################################# SAVE ################################# + # save the ROI time series and task data + task_data = { + "task": task, + "task_labels": task_labels, + "task_types": TASKS, + "event_labels": event_labels, + "event_types": event_types, + "events": events, + "Fs_task": Fs_task, + "TR_mri": TR_mri, + "num_time_mri": num_time_mri, + } + if multi_run_flag: + output_file_prefix = f"{subj}_{task}_{run}" + else: + output_file_prefix = f"{subj}_{task}" + if not os.path.exists(f"{output_root}/{subj}/"): + os.makedirs(f"{output_root}/{subj}/") + np.save(f"{output_root}/{subj}/{output_file_prefix}_time-series.npy", time_series) + np.save(f"{output_root}/{subj}/{output_file_prefix}_task-data.npy", task_data) + + +######################################################################################## + +if __name__ == "__main__": + # argparse + HELPTEXT = """ + Script to convert nifti files to ROI signals for a given participant. 
+ """ + + parser = argparse.ArgumentParser(description=HELPTEXT) + + parser.add_argument("--dataset_info", type=str, help="path to dataset info file") + parser.add_argument("--participant_id", type=str, help="participant id") - ################################# LOAD JSON INFO ######################### - # Opening JSON file as a dictionary - f = open(info_file) - acquisition_data = json.load(f) - f.close() - TR_mri = acquisition_data["RepetitionTime"] - ################################# EXTRACT TIME SERIES ######################### - # extract ROI signals and convert to TIME_SERIES object - time_series = data_loader.nifti2timeseries( - nifti_file=fmriprep_file, - n_rois=100, - Fs=1 / TR_mri, - subj_id=subj, - confound_strategy="no_motion", - standardize="zscore", - TS_name="BOLD", - session=task, + args = parser.parse_args() + + dataset_info_file = args.dataset_info + participant_id = args.participant_id + + # Read global configs + with open(dataset_info_file, "r") as f: + dataset_info = json.load(f) + + print( + f"subject-level ROI signal extraction CODE started running ... for subject: {participant_id} ..." 
) - num_time_mri = time_series.n_time - ################################# EXTRACT TASK LABELS ######################### - oversampling = 50 # more samples per TR than the func data to have a better event_labels time resolution - if task == "task-restingstate": - events = [] - event_types = ["rest"] - event_labels = np.zeros((int(num_time_mri * oversampling), 1)) - task_labels = np.zeros((int(num_time_mri * oversampling), 1)) - Fs_task = float(1 / TR_mri) * oversampling + + TASKS = dataset_info["TASKS"] + + if "{dataset}" in dataset_info["main_root"]: + main_root = dataset_info["main_root"].replace( + "{dataset}", dataset_info["dataset"] + ) else: - task_events_root = f"{main_root}/{subj}/func/" - ALL_EVENTS_FILES = os.listdir(task_events_root) - ALL_EVENTS_FILES = [ - i - for i in ALL_EVENTS_FILES - if (subj in i) and (task in i) and ("events.tsv" in i) - ] - if not len(ALL_EVENTS_FILES) == 1: - # if the events file is not found, exclude the subject - print("Events file not found for " + subj + " " + task) - continue - # load the tsv events file - events_file = task_events_root + ALL_EVENTS_FILES[0] - events = np.genfromtxt(events_file, delimiter="\t", dtype=str) - # get the task labels - event_types = ["rest"] + list(np.unique(events[1:, 2])) - event_labels, Fs_task = task_utils.events_time_to_labels( - events=events, - TR_mri=TR_mri, - num_time_mri=num_time_mri, - event_types=event_types, - oversampling=oversampling, - return_0_1=False, + main_root = dataset_info["main_root"] + + if "{main_root}" in dataset_info["fmriprep_root"]: + fmriprep_root = dataset_info["fmriprep_root"].replace("{main_root}", main_root) + else: + fmriprep_root = dataset_info["fmriprep_root"] + + if "{main_root}" in dataset_info["output_root"]: + output_root = dataset_info["output_root"].replace("{main_root}", main_root) + else: + output_root = dataset_info["output_root"] + + for task in TASKS: + run_roi_signal_extraction( + subj=participant_id, + task=task, + main_root=main_root, + 
fmriprep_root=fmriprep_root, + bold_suffix=dataset_info["bold_suffix"], + output_root=output_root, ) - # fill task labels with 0 (rest) and k (task's index) - task_labels = np.multiply(event_labels != 0, TASKS.index(task)) - ################################# SAVE ################################# - # save the ROI time series and task data - task_data = { - "task": task, - "task_labels": task_labels, - "task_types": TASKS, - "event_labels": event_labels, - "event_types": event_types, - "events": events, - "Fs_task": Fs_task, - "TR_mri": TR_mri, - "num_time_mri": num_time_mri, - } - subj_folder = f"{subj}_{task}" - if not os.path.exists(f"{output_root}/{subj_folder}/"): - os.makedirs(f"{output_root}/{subj_folder}/") - np.save(f"{output_root}/{subj_folder}/time_series.npy", time_series) - np.save(f"{output_root}/{subj_folder}/task_data.npy", task_data) - -print( - f"subject-level ROI signal extraction CODE finished running ... for subject: {subj} ..." -) + + print( + f"subject-level ROI signal extraction CODE finished running ... for subject: {participant_id} ..." 
+ ) + #################################################################### From 8c6a5166099b2918df0e89efda142c67180c8e6b Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 26 Apr 2024 15:45:03 -0400 Subject: [PATCH 011/401] update output save --- simul_dFC/task_data_simulator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py index f6bc3a9..d4d5cd2 100644 --- a/simul_dFC/task_data_simulator.py +++ b/simul_dFC/task_data_simulator.py @@ -143,11 +143,11 @@ time_series, task_data = simul_utils.simulate_task(subj_id, all_task_info[task]) # save the time series and task data - subj_folder = f"{subj_id}_{task}" - if not os.path.exists(f"{output_root}/{subj_folder}/"): - os.makedirs(f"{output_root}/{subj_folder}/") - np.save(f"{output_root}/{subj_folder}/time_series.npy", time_series) - np.save(f"{output_root}/{subj_folder}/task_data.npy", task_data) + output_file_prefix = f"{subj_id}_{task}" + if not os.path.exists(f"{output_root}/{subj_id}/"): + os.makedirs(f"{output_root}/{subj_id}/") + np.save(f"{output_root}/{subj_id}/{output_file_prefix}_time-series.npy", time_series) + np.save(f"{output_root}/{subj_id}/{output_file_prefix}_task-data.npy", task_data) print("****************** DONE ******************") #################################################################################### From b1ec4b45fda8bc4545d26531dcac703f5fb8ecce Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 26 Apr 2024 15:46:28 -0400 Subject: [PATCH 012/401] fix simul_utils --- pydfc/simul_utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py index 109aac2..51bf728 100644 --- a/pydfc/simul_utils.py +++ b/pydfc/simul_utils.py @@ -6,11 +6,7 @@ @author: Mohammad Torabi """ -import re -from calendar import c - import numpy as np -from matplotlib.pylab import rand from tvb.simulator.lab import * from pydfc import TIME_SERIES, task_utils From 
d0087a0e99cdc6560ff089133bf83a1ff54d7515 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 26 Apr 2024 20:36:24 -0400 Subject: [PATCH 013/401] fix bug --- pydfc/simul_utils.py | 6 +++--- simul_dFC/task_data_simulator.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py index 51bf728..d716498 100644 --- a/pydfc/simul_utils.py +++ b/pydfc/simul_utils.py @@ -182,7 +182,7 @@ def create_simul_task_info( generate more samples per TR than the func data to have a better event_labels time resolution """ - ################################# EXTRACT TASK LABELS ######################### + ####################### EXTRACT TASK LABELS ####################### events = [] event_types = ["rest", "task"] @@ -276,7 +276,7 @@ def simulate_task_BOLD_TS( return time_series, task_data -def simulate_task(subj_id, task_info): +def simulate_task_data(subj_id, task_info): """ Simulate task-based BOLD signal for a subject. @@ -309,7 +309,7 @@ def simulate_task(subj_id, task_info): - dt: float The simulation time step. 
""" - time_series, task_data = simulate_task( + time_series, task_data = simulate_task_BOLD_TS( subj_id=subj_id, task=task_info["task_name"], onset_time=task_info["onset_time"], diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py index d4d5cd2..7823932 100644 --- a/simul_dFC/task_data_simulator.py +++ b/simul_dFC/task_data_simulator.py @@ -140,7 +140,7 @@ for task in all_task_info: - time_series, task_data = simul_utils.simulate_task(subj_id, all_task_info[task]) + time_series, task_data = simul_utils.simulate_task_data(subj_id, all_task_info[task]) # save the time series and task data output_file_prefix = f"{subj_id}_{task}" From ee39b25e5b9af3d669b199064625a4c8fa2f508a Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 29 Apr 2024 13:43:49 -0400 Subject: [PATCH 014/401] update FCS_estimate --- simul_dFC/FCS_estimate.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/simul_dFC/FCS_estimate.py b/simul_dFC/FCS_estimate.py index caf2aa0..5ccd06e 100644 --- a/simul_dFC/FCS_estimate.py +++ b/simul_dFC/FCS_estimate.py @@ -14,21 +14,27 @@ ################################# Parameters ################################# # data paths -dataset = "ds000001" +dataset = "ds000002" # main_root = f"./DATA/{dataset}" # for local -main_root = f"../../DATA/task-based/simulated/{dataset}" # for server +main_root = f"/data/origami/dFC/DATA/task-based/simulated/{dataset}" # for server roi_root = f"{main_root}/derivatives/ROI_timeseries" output_root = f"{main_root}/derivatives/fitted_MEASURES" -# for consistency we use 0 for resting state -TASKS = ["task-pulse"] +TASKS = [ + "task-midFreqMidRest", + "task-lowFreqLongRest", + "task-lowFreqShortRest", + "task-lowFreqShortTask", + "task-highFreqLongRest", + "task-highFreqShortRest", + "task-midFreqMidRestNoisy", +] -# job_id = int(os.getenv("SGE_TASK_ID")) -# TASK_id = job_id-1 # SGE_TASK_ID starts from 1 not 0 -# if TASK_id >= len(TASKS): -# print("TASK_id out of 
TASKS") -# exit() -TASK_id = 0 +job_id = int(os.getenv("SGE_TASK_ID")) +TASK_id = job_id - 1 # SGE_TASK_ID starts from 1 not 0 +if TASK_id >= len(TASKS): + print("TASK_id out of TASKS") + exit() task = TASKS[TASK_id] ###### MEASUREMENT PARAMETERS ###### From 435186469f9fbbeb8e06a0f0aee22c5edd1b7309 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 29 Apr 2024 21:52:34 -0400 Subject: [PATCH 015/401] update FCS_estimate --- simul_dFC/FCS_estimate.py | 26 +++---- task_dFC/FCS_estimate.py | 126 +++++++++++++++++--------------- task_dFC/nifti_to_roi_signal.py | 9 ++- 3 files changed, 86 insertions(+), 75 deletions(-) diff --git a/simul_dFC/FCS_estimate.py b/simul_dFC/FCS_estimate.py index 5ccd06e..0fd7653 100644 --- a/simul_dFC/FCS_estimate.py +++ b/simul_dFC/FCS_estimate.py @@ -65,8 +65,8 @@ "session": task, # Hyper Parameters "normalization": True, - "num_subj": None, # None or 200? - "num_time_point": None, # None or set? + "num_subj": None, + "num_time_point": None, } ###### HYPER PARAMETERS ALTERNATIVE ###### @@ -106,15 +106,12 @@ ################################# LOAD DATA ################################# BOLD = data_loader.load_TS( - data_root=roi_root, file_name="time_series.npy", SESSIONs=task, subj_id2load=None + data_root=roi_root, + file_name="{subj_id}_{task}_time-series.npy", + SESSIONs=task, + subj_id2load=None, + task=task, ) - -################################# Visualize BOLD ################################# - -# for session in BOLD: -# BOLD.visualize(start_time=0, end_time=2000, nodes_lst=list(range(10)), -# save_image=False, output_root=None) - ################################ Measures of dFC ################################# MA = MultiAnalysis( @@ -136,15 +133,14 @@ if measure.is_state_based: measure.estimate_FCS(time_series=BOLD) - # dFC_analyzer.estimate_group_FCS(time_series_dict=BOLD) print("FCS estimation done.") # Save - if not os.path.exists(f"{output_root}/{task}"): - os.makedirs(f"{output_root}/{task}") - 
np.save(f"{output_root}/{task}/MEASURE_{str(MEASURE_id)}.npy", measure) + if not os.path.exists(f"{output_root}"): + os.makedirs(f"{output_root}") + np.save(f"{output_root}/MEASURE_{task}_{MEASURE_id}.npy", measure) print(f"Measurement required {time.time() - tic:0.3f} seconds.") -np.save(f"{output_root}/{task}/multi_analysis.npy", MA) +np.save(f"{output_root}/multi-analysis_{task}.npy", MA) ################################################################################# diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py index de4d738..d171085 100644 --- a/task_dFC/FCS_estimate.py +++ b/task_dFC/FCS_estimate.py @@ -14,20 +14,21 @@ ################################# Parameters ################################# # data paths -# main_root = '../../DATA/ds002785/' # for local -main_root = "../../../DATA/task-based/openneuro/ds002785" # for server +dataset = "ds003242" +# main_root = f"../../DATA/{dataset}" # for local +main_root = f"/data/origami/dFC/DATA/task-based/openneuro/{dataset}" # for server roi_root = f"{main_root}/derivatives/ROI_timeseries" output_root = f"{main_root}/derivatives/fitted_MEASURES" # for consistency we use 0 for resting state -TASKS = [ - "task-restingstate", - "task-anticipation", - "task-emomatching", - "task-faces", - "task-gstroop", - "task-workingmemory", -] +TASKS = ["task-CIC", "task-midloc"] + +# default RUNS = None +RUNS = None +RUNS = { + "task-CIC": ["run-001", "run-002", "run-003", "run-004", "run-005", "run-006"], + "task-midloc": ["run-001"], +} job_id = int(os.getenv("SGE_TASK_ID")) TASK_id = job_id - 1 # SGE_TASK_ID starts from 1 not 0 @@ -42,7 +43,7 @@ params_methods = { # Sliding Parameters - "W": 44, + "W": 12, "n_overlap": 1.0, "sw_method": "pear_corr", "tapered_window": True, @@ -54,8 +55,8 @@ "hmm_iter": 20, "dhmm_obs_state_ratio": 16 / 24, # State Parameters - "n_states": 12, - "n_subj_clstrs": 20, + "n_states": 5, + "n_subj_clstrs": 10, # Parallelization Parameters "n_jobs": 2, "verbose": 0, @@ -64,8 +65,8 
@@ "session": task, # Hyper Parameters "normalization": True, - "num_subj": None, # None or 216? - "num_time_point": None, # None or set? + "num_subj": None, + "num_time_point": None, } ###### HYPER PARAMETERS ALTERNATIVE ###### @@ -102,48 +103,57 @@ "backend": "loky", } -################################# LOAD DATA ################################# - -BOLD = data_loader.load_TS( - data_root=roi_root, file_name="time_series.npy", SESSIONs=task, subj_id2load=None -) - -################################# Visualize BOLD ################################# - -# for session in BOLD: -# BOLD.visualize(start_time=0, end_time=2000, nodes_lst=list(range(10)), -# save_image=False, output_root=None) - -################################ Measures of dFC ################################# - -MA = MultiAnalysis( - analysis_name=f"task-based-dFC-ds002785-{task}", **params_multi_analysis -) - -MEASURES_lst = MA.measures_initializer(MEASURES_name_lst, params_methods, alter_hparams) - -tic = time.time() -print("Measurement Started ...") - -################################# estimate FCS ################################# - -for MEASURE_id, measure in enumerate(MEASURES_lst): - - print("MEASURE: " + measure.measure_name) - print("FCS estimation started...") - - if measure.is_state_based: - measure.estimate_FCS(time_series=BOLD) - - # dFC_analyzer.estimate_group_FCS(time_series_dict=BOLD) - print("FCS estimation done.") - - # Save - if not os.path.exists(f"{output_root}/{task}"): - os.makedirs(f"{output_root}/{task}") - np.save(f"{output_root}/{task}/MEASURE_{str(MEASURE_id)}.npy", measure) - -print(f"Measurement required {time.time() - tic:0.3f} seconds.") -np.save(f"{output_root}/{task}/multi_analysis.npy", MA) +if RUNS is None: + RUNS = {task: [None]} +for run in RUNS[task]: + if run is None: + print(f"TASK: {task} started ...") + file_suffix = f"{task}" + BOLD_file_name = "{subj_id}_{task}_time-series.npy" + else: + print(f"TASK: {task}, RUN: {run} started ...") + file_suffix = 
f"{task}_{run}" + BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy" + ################################# LOAD DATA ################################# + BOLD = data_loader.load_TS( + data_root=roi_root, + file_name=BOLD_file_name, + SESSIONs=task, + subj_id2load=None, + task=task, + run=run, + ) + ################################ Measures of dFC ################################# + + MA = MultiAnalysis( + analysis_name=f"task-based-dFC-{dataset}-{file_suffix}", **params_multi_analysis + ) + + MEASURES_lst = MA.measures_initializer( + MEASURES_name_lst, params_methods, alter_hparams + ) + + tic = time.time() + print("Measurement Started ...") + + ################################# estimate FCS ################################# + + for MEASURE_id, measure in enumerate(MEASURES_lst): + + print("MEASURE: " + measure.measure_name) + print("FCS estimation started...") + + if measure.is_state_based: + measure.estimate_FCS(time_series=BOLD) + + print("FCS estimation done.") + + # Save + if not os.path.exists(f"{output_root}"): + os.makedirs(f"{output_root}") + np.save(f"{output_root}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure) + + print(f"Measurement required {time.time() - tic:0.3f} seconds.") + np.save(f"{output_root}/multi-analysis_{file_suffix}.npy", MA) ################################################################################# diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index 0e20700..59c8792 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -18,7 +18,12 @@ def run_roi_signal_extraction( Extract ROI signals and task labels for a given subject and task """ # find the func file for this subject and task - ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/") + try: + ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/") + except FileNotFoundError: + print(f"Subject {subj} not found in {fmriprep_root}") + return + ALL_TASK_FILES = [ file_i for file_i in ALL_TASK_FILES @@ 
-27,7 +32,7 @@ def run_roi_signal_extraction( if not len(ALL_TASK_FILES) >= 1: # if the func file is not found, exclude the subject - print("Func file not found for " + subj + " " + task) + print(f"Func file not found for {subj} {task}") return # there might be multiple runs for the same task From 0c6158ff032465e3db54a8b6d49c14e62a8ea8af Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 1 May 2024 17:53:19 -0400 Subject: [PATCH 016/401] reorganize dFC_assess --- task_dFC/dFC_assessment.py | 257 ++++++++++++++++++++++---------- task_dFC/nifti_to_roi_signal.py | 6 +- 2 files changed, 179 insertions(+), 84 deletions(-) diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py index a381f95..84564c4 100644 --- a/task_dFC/dFC_assessment.py +++ b/task_dFC/dFC_assessment.py @@ -1,3 +1,5 @@ +import argparse +import json import os import time import warnings @@ -12,97 +14,190 @@ os.environ["NUMEXPR_NUM_THREADS"] = "16" os.environ["OMP_NUM_THREADS"] = "16" -################################# Parameters ################################# - -# Data parameters -# main_root = '../../DATA/ds002785/' # for local -main_root = "../../../DATA/task-based/openneuro/ds002785/" # for server - -# subjects used for dFC assessment do not need to be the same as those used for FCS_estimate -# you can set the new roi root and data load parameters here: -roi_root = f"{main_root}/derivatives/ROI_timeseries" -fitted_measures_root = f"{main_root}/derivatives/fitted_MEASURES" -output_root = f"{main_root}/derivatives/dFC_assessed" - -# for consistency we use 0 for resting state -TASKS = [ - "task-restingstate", - "task-anticipation", - "task-emomatching", - "task-faces", - "task-gstroop", - "task-workingmemory", -] - -# find all subjects across all tasks -SUBJECTS = data_loader.find_subj_list(data_root=roi_root, sessions=TASKS) - -# job_id selects the subject -job_id = int(os.getenv("SGE_TASK_ID")) -if job_id > len(SUBJECTS): - print("job_id > len(SUBJECTS)") - exit() -subj_id = 
SUBJECTS[job_id - 1] # SGE_TASK_ID starts from 1 not 0 - -for task in TASKS: - - MA = np.load( - f"{fitted_measures_root}/{task}/multi_analysis.npy", allow_pickle="TRUE" - ).item() - - # check if the subject has this task - SUBJECTS_with_this_task = data_loader.find_subj_list( - data_root=roi_root, sessions=[task] - ) - if not subj_id in SUBJECTS_with_this_task: - print(f"subject {subj_id} not in the list of subjects with task {task}") - continue - - ################################# LOAD FIT MEASURES ################################# - - ALL_RECORDS = os.listdir(f"{fitted_measures_root}/{task}/") - ALL_RECORDS = [i for i in ALL_RECORDS if "MEASURE" in i] - ALL_RECORDS.sort() - MEASURES_fit_lst = list() - for s in ALL_RECORDS: - fit_measure = np.load( - f"{fitted_measures_root}/{task}/{s}", allow_pickle="TRUE" +################################# Functions ################################# + + +def run_dFC_assess( + subj_id, + task, + roi_root, + fitted_measures_root, + output_root, +): + + # check if the subject has this task in roi_root + if not os.path.exists(f"{roi_root}/{subj_id}"): + print(f"Subject {subj_id} not found in {roi_root}") + return + + ALL_ROI_FILES = os.listdir(f"{roi_root}/{subj_id}/") + ALL_ROI_FILES = [ + roi_file + for roi_file in ALL_ROI_FILES + if ("_time-series.npy" in roi_file) and (task in roi_file) + ] + ALL_ROI_FILES.sort() + + # check if "_run" exists in all the task file names + if all(["_run" in roi_file for roi_file in ALL_ROI_FILES]): + # find all the runs + RUNS = [ + roi_file[ + roi_file.find("_run") + + 1 : roi_file.find("_run") + + 1 + + roi_file[roi_file.find("_run") + 1 :].find("_") + ] + for roi_file in ALL_ROI_FILES + ] + # sort + RUNS.sort() + print(f"Found multiple runs for {subj_id} {task}: {RUNS}") + else: + RUNS = [None] + + for run in RUNS: + + # check if the subject has this task and run in roi_root + if run is None: + file_suffix = f"{task}" + if not os.path.exists( + 
f"{roi_root}/{subj_id}/{subj_id}_{file_suffix}_time-series.npy" + ): + print(f"Time series file not found for {subj_id} {task}") + continue + else: + print( + f"subject-level dFC assessment CODE started running ... for task {task} of subject {subj_id} ..." + ) + BOLD_file_name = "{subj_id}_{task}_time-series.npy" + else: + file_suffix = f"{task}_{run}" + if not os.path.exists( + f"{roi_root}/{subj_id}/{subj_id}_{file_suffix}_time-series.npy" + ): + print(f"Time series file not found for {subj_id} {task} {run}") + continue + else: + print( + f"subject-level dFC assessment CODE started running ... for task {task} and {run} of subject {subj_id} ..." + ) + BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy" + + ################################# LOAD FIT MEASURES ################################# + + MA = np.load( + f"{fitted_measures_root}/multi-analysis_{file_suffix}.npy", + allow_pickle="TRUE", ).item() - MEASURES_fit_lst.append(fit_measure) - MA.set_MEASURES_fit_lst(MEASURES_fit_lst) - print("fitted MEASURES loaded ...") - ################################# LOAD DATA ################################# + ALL_RECORDS = os.listdir(f"{fitted_measures_root}/") + ALL_RECORDS = [i for i in ALL_RECORDS if ("MEASURE" in i) and (file_suffix in i)] + ALL_RECORDS.sort() + MEASURES_fit_lst = list() + for s in ALL_RECORDS: + fit_measure = np.load( + f"{fitted_measures_root}/{s}", allow_pickle="TRUE" + ).item() + MEASURES_fit_lst.append(fit_measure) + MA.set_MEASURES_fit_lst(MEASURES_fit_lst) + print("fitted MEASURES are loaded ...") - print( - f"subject-level dFC assessment CODE started running ... for task {task} of subject {subj_id} ..." 
- ) + ################################# LOAD DATA ################################# - BOLD = data_loader.load_TS( - data_root=roi_root, - file_name="time_series.npy", - SESSIONs=[task], - subj_id2load=subj_id, - ) + BOLD = data_loader.load_TS( + data_root=roi_root, + file_name=BOLD_file_name, + SESSIONs=task, + subj_id2load=subj_id, + task=task, + run=run, + ) + + ################################# dFC ASSESSMENT ################################# + + tic = time.time() + print("Measurement Started ...") + + print("dFC estimation started...") + dFC_dict = MA.subj_lvl_dFC_assess(time_series_dict=BOLD) + print("dFC estimation done.") + + print(f"Measurement required {time.time() - tic:0.3f} seconds.") + + ################################# SAVE DATA ################################# + + folder = f"{output_root}/{subj_id}" + if not os.path.exists(folder): + os.makedirs(folder) - ################################# dFC ASSESSMENT ################################# + for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]): + np.save(f"{folder}/dFC_{file_suffix}_{dFC_id}.npy", dFC) - tic = time.time() - print("Measurement Started ...") - print("dFC estimation started...") - dFC_dict = MA.subj_lvl_dFC_assess(time_series_dict=BOLD) - print("dFC estimation done.") +####################################################################################### + +if __name__ == "__main__": + # argparse + HELPTEXT = """ + Script to assess dFC for a given participant. 
+ """ + + parser = argparse.ArgumentParser(description=HELPTEXT) + + parser.add_argument("--dataset_info", type=str, help="path to dataset info file") + parser.add_argument("--participant_id", type=str, help="participant id") + + args = parser.parse_args() - print(f"Measurement required {time.time() - tic:0.3f} seconds.") + dataset_info_file = args.dataset_info + participant_id = args.participant_id - ################################# SAVE DATA ################################# + # Read global configs + with open(dataset_info_file, "r") as f: + dataset_info = json.load(f) - folder = f"{output_root}/{task}/{subj_id}" - if not os.path.exists(folder): - os.makedirs(folder) + print( + f"subject-level dFC assessment CODE started running ... for subject: {participant_id} ..." + ) + + TASKS = dataset_info["TASKS"] + + if "{dataset}" in dataset_info["main_root"]: + main_root = dataset_info["main_root"].replace( + "{dataset}", dataset_info["dataset"] + ) + else: + main_root = dataset_info["main_root"] + + if "{main_root}" in dataset_info["roi_root"]: + roi_root = dataset_info["roi_root"].replace("{main_root}", main_root) + else: + roi_root = dataset_info["roi_root"] + + if "{main_root}" in dataset_info["fitted_measures_root"]: + fitted_measures_root = dataset_info["fitted_measures_root"].replace( + "{main_root}", main_root + ) + else: + fitted_measures_root = dataset_info["fitted_measures_root"] + + if "{main_root}" in dataset_info["dFC_root"]: + output_root = dataset_info["dFC_root"].replace("{main_root}", main_root) + else: + output_root = dataset_info["dFC_root"] + + for task in TASKS: + run_dFC_assess( + subj_id=participant_id, + task=task, + roi_root=roi_root, + fitted_measures_root=fitted_measures_root, + output_root=output_root, + ) - for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]): - np.save(f"{folder}/dFC_{str(dFC_id)}.npy", dFC) + print( + f"subject-level dFC assessment CODE finished running ... for subject: {participant_id} ..." 
+ ) ####################################################################################### diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index 59c8792..7ee8870 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -190,10 +190,10 @@ def run_roi_signal_extraction( else: fmriprep_root = dataset_info["fmriprep_root"] - if "{main_root}" in dataset_info["output_root"]: - output_root = dataset_info["output_root"].replace("{main_root}", main_root) + if "{main_root}" in dataset_info["roi_root"]: + output_root = dataset_info["roi_root"].replace("{main_root}", main_root) else: - output_root = dataset_info["output_root"] + output_root = dataset_info["roi_root"] for task in TASKS: run_roi_signal_extraction( From 8a1b0ba7a69665566f36451e8365851f5151ba08 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 17 May 2024 23:05:05 -0400 Subject: [PATCH 017/401] Change dFC_assessment --- task_dFC/dFC_assessment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py index 84564c4..cb8993f 100644 --- a/task_dFC/dFC_assessment.py +++ b/task_dFC/dFC_assessment.py @@ -120,7 +120,7 @@ def run_dFC_assess( print("Measurement Started ...") print("dFC estimation started...") - dFC_dict = MA.subj_lvl_dFC_assess(time_series_dict=BOLD) + dFC_dict = MA.subj_lvl_dFC_assess(time_series=BOLD) print("dFC estimation done.") print(f"Measurement required {time.time() - tic:0.3f} seconds.") From 0e6e1b79b7d4d9e2d04d7f6e620784297b5b33ce Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Sat, 18 May 2024 11:20:40 -0400 Subject: [PATCH 018/401] minor fix --- task_dFC/dFC_assessment.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py index cb8993f..b3068b0 100644 --- a/task_dFC/dFC_assessment.py +++ b/task_dFC/dFC_assessment.py @@ -38,6 +38,11 @@ def run_dFC_assess( ] ALL_ROI_FILES.sort() + # if there are no files for 
this task, return + if not len(ALL_ROI_FILES) >= 1: + print(f"No time series files found for {subj_id} {task}") + return + # check if "_run" exists in all the task file names if all(["_run" in roi_file for roi_file in ALL_ROI_FILES]): # find all the runs From f38d22a161d4f3f7f92a68055673ed614b681d4b Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Sun, 26 May 2024 16:26:56 -0400 Subject: [PATCH 019/401] update ML.py --- simul_dFC/KNN_ML.py | 640 ++++++++++++++++++++++++++++---------------- task_dFC/ML.py | 461 +++++++++++++++++++++++++++++++ 2 files changed, 875 insertions(+), 226 deletions(-) create mode 100644 task_dFC/ML.py diff --git a/simul_dFC/KNN_ML.py b/simul_dFC/KNN_ML.py index 44eca0a..c1b60cc 100644 --- a/simul_dFC/KNN_ML.py +++ b/simul_dFC/KNN_ML.py @@ -1,9 +1,11 @@ +import argparse +import json import os import numpy as np from sklearn.decomposition import PCA from sklearn.metrics import balanced_accuracy_score -from sklearn.model_selection import GridSearchCV, train_test_split +from sklearn.model_selection import GridSearchCV from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler @@ -11,128 +13,54 @@ from pydfc import DFC, data_loader, task_utils from pydfc.dfc_utils import dFC_mat2vec, rank_norm -# Data parameters -dataset = "ds000001" - -# main_root = f"./DATA/{dataset}" # for local -main_root = f"../../DATA/task-based/simulated/{dataset}" # for server -roi_root = f"{main_root}/derivatives/ROI_timeseries" -dFC_root = f"{main_root}/derivatives/dFC_assessed" -output_root = "./ML_RESULTS_KNN_classify" - -TASKS = ["task-pulse"] - -dynamic_pred = "no" # 'past' or 'past_and_future' or 'no' (only current TR) -normalize_dFC = True - -SUBJECTS = data_loader.find_subj_list(data_root=roi_root, sessions=TASKS) - -# randomly select 80% of the subjects for training and 20% for testing using numpy.random.choice -train_subjects = np.random.choice(SUBJECTS, int(0.8 * 
len(SUBJECTS)), replace=False) -test_subjects = np.setdiff1d(SUBJECTS, train_subjects) - -print( - f"number of train_subjects: {len(train_subjects)} and test_subjects: {len(test_subjects)}" -) - - -################## TASK FEATURES ################## - -task_features = { - "task": list(), - "relative_task_on": list(), - "avg_task_duration": list(), - "var_task_duration": list(), - "avg_rest_duration": list(), - "var_rest_duration": list(), - "num_of_transitions": list(), - "relative_transition_freq": list(), -} -for task_id, task in enumerate(TASKS): - - if task == "task-restingstate": - continue - - for subj in SUBJECTS: - # event data - task_data = np.load( - f"{roi_root}/{subj}_{task}/task_data.npy", allow_pickle="TRUE" - ).item() - Fs_task = task_data["Fs_task"] - TR_task = 1 / Fs_task - - task_presence = task_utils.extract_task_presence( - event_labels=task_data["event_labels"], - TR_task=TR_task, - TR_mri=task_data["TR_mri"], - binary=True, - ) - - relative_task_on = task_utils.relative_task_on(task_presence) - # task duration - avg_task_duration, var_task_duration = task_utils.task_duration( - task_presence, task_data["TR_mri"] - ) - # rest duration - avg_rest_duration, var_rest_duration = task_utils.rest_duration( - task_presence, task_data["TR_mri"] - ) - # freq of transitions - num_of_transitions, relative_transition_freq = task_utils.transition_freq( - task_presence - ) - - task_features["task"].append(task) - task_features["relative_task_on"].append(relative_task_on) - task_features["avg_task_duration"].append(avg_task_duration) - task_features["var_task_duration"].append(var_task_duration) - task_features["avg_rest_duration"].append(avg_rest_duration) - task_features["var_rest_duration"].append(var_rest_duration) - task_features["num_of_transitions"].append(num_of_transitions) - task_features["relative_transition_freq"].append(relative_transition_freq) - - -################## TASK PRESENCE CLASSIFICATION ################## -ML_scores = { - "subj_id": 
list(), - "group": list(), - "task": list(), - "dFC method": list(), - "KNN accuracy": list(), -} -for dFC_id in range(0, 7): - print(f"=================== dFC {dFC_id} ===================") - - ML_RESULT = {} +####################################################################################### + + +def find_available_subjects(dFC_root, task, dFC_id=None): + """ + Find the subjects that have dFC results for the given task and dFC_id (method). + """ + SUBJECTS = list() + ALL_SUBJ_FOLDERS = os.listdir(f"{dFC_root}/") + ALL_SUBJ_FOLDERS = [folder for folder in ALL_SUBJ_FOLDERS if "sub-" in folder] + ALL_SUBJ_FOLDERS.sort() + for subj_folder in ALL_SUBJ_FOLDERS: + ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/") + ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if task in dFC_file] + if dFC_id is not None: + ALL_DFC_FILES = [ + dFC_file for dFC_file in ALL_DFC_FILES if f"_{dFC_id}.npy" in dFC_file + ] + ALL_DFC_FILES.sort() + if len(ALL_DFC_FILES) > 0: + SUBJECTS.append(subj_folder) + return SUBJECTS + + +def extract_task_features(TASKS, roi_root, output_root): + """ + Extract task features from the event data.""" + task_features = { + "task": list(), + "relative_task_on": list(), + "avg_task_duration": list(), + "var_task_duration": list(), + "avg_rest_duration": list(), + "var_rest_duration": list(), + "num_of_transitions": list(), + "relative_transition_freq": list(), + } for task_id, task in enumerate(TASKS): - print(f"=============== {task} ===============") if task == "task-restingstate": continue - X_train = None - X_test = None - y_condition_train = None - y_condition_test = None - subj_label_train = list() - subj_label_test = list() + SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task) for subj in SUBJECTS: - - dFC = np.load( - f"{dFC_root}/{task}/{subj}/dFC_{dFC_id}.npy", allow_pickle="TRUE" - ).item() - - dFC_mat = dFC.get_dFC_mat() - TR_array = dFC.TR_array - if normalize_dFC: - dFC_mat = rank_norm(dFC_mat) - - dFC_vecs = 
dFC_mat2vec(dFC_mat) - # event data task_data = np.load( - f"{roi_root}/{subj}_{task}/task_data.npy", allow_pickle="TRUE" + f"{roi_root}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" ).item() Fs_task = task_data["Fs_task"] TR_task = 1 / Fs_task @@ -141,132 +69,392 @@ event_labels=task_data["event_labels"], TR_task=TR_task, TR_mri=task_data["TR_mri"], - TR_array=TR_array, binary=True, ) - X_new = dFC_vecs - y_new = task_presence.ravel() - - if dynamic_pred == "past": - # concat current TR and two TR before of X_new to predict the current TR of y_new - # ignore the edge case of the first two TRs - X_new = np.concatenate( - (X_new, np.roll(X_new, 1, axis=0), np.roll(X_new, 2, axis=0)), axis=1 - ) - X_new = X_new[2:, :] - y_new = y_new[2:] - - elif dynamic_pred == "past_and_future": - # concat current TR and two TR before and after of X_new to predict the current TR of y_new - # ignore the edge case of the first and last two TRs - X_new = np.concatenate( - ( - X_new, - np.roll(X_new, 1, axis=0), - np.roll(X_new, 2, axis=0), - np.roll(X_new, -1, axis=0), - np.roll(X_new, -2, axis=0), - ), - axis=1, - ) - X_new = X_new[2:-2, :] - y_new = y_new[2:-2] - - if subj in train_subjects: - subj_label_train.extend([subj for i in range(X_new.shape[0])]) - if X_train is None and y_condition_train is None: - X_train = X_new - y_condition_train = y_new - else: - X_train = np.concatenate((X_train, X_new), axis=0) - y_condition_train = np.concatenate((y_condition_train, y_new), axis=0) - elif subj in test_subjects: - subj_label_test.extend([subj for i in range(X_new.shape[0])]) - if X_test is None and y_condition_test is None: - X_test = X_new - y_condition_test = y_new - else: - X_test = np.concatenate((X_test, X_new), axis=0) - y_condition_test = np.concatenate((y_condition_test, y_new), axis=0) - - print( - X_train.shape, X_test.shape, y_condition_train.shape, y_condition_test.shape - ) - subj_label_train = np.array(subj_label_train) - subj_label_test = np.array(subj_label_test) 
- print(subj_label_train.shape, subj_label_test.shape) + relative_task_on = task_utils.relative_task_on(task_presence) + # task duration + avg_task_duration, var_task_duration = task_utils.task_duration( + task_presence, task_data["TR_mri"] + ) + # rest duration + avg_rest_duration, var_rest_duration = task_utils.rest_duration( + task_presence, task_data["TR_mri"] + ) + # freq of transitions + num_of_transitions, relative_transition_freq = task_utils.transition_freq( + task_presence + ) - # task presence classification + task_features["task"].append(task) + task_features["relative_task_on"].append(relative_task_on) + task_features["avg_task_duration"].append(avg_task_duration) + task_features["var_task_duration"].append(var_task_duration) + task_features["avg_rest_duration"].append(avg_rest_duration) + task_features["var_rest_duration"].append(var_rest_duration) + task_features["num_of_transitions"].append(num_of_transitions) + task_features["relative_transition_freq"].append(relative_transition_freq) - print("task presence classification ...") + folder = f"{output_root}" + if not os.path.exists(folder): + os.makedirs(folder) + np.save(f"{folder}/task_features_KNN_classify.npy", task_features) + + +def dFC_feature_extraction_subj_lvl( + dFC, + task_data, + dynamic_pred="no", + normalize_dFC=True, +): + """ + Extract features and target for task presence classification + for a single subject. 
+ """ + # dFC features + dFC_mat = dFC.get_dFC_mat() + TR_array = dFC.TR_array + if normalize_dFC: + dFC_mat = rank_norm(dFC_mat) + dFC_vecs = dFC_mat2vec(dFC_mat) + + # event data + task_presence = task_utils.extract_task_presence( + event_labels=task_data["event_labels"], + TR_task=1 / task_data["Fs_task"], + TR_mri=task_data["TR_mri"], + TR_array=TR_array, + binary=True, + ) + + features = dFC_vecs + target = task_presence.ravel() + + if dynamic_pred == "past": + # concat current TR and two TR before of features to predict the current TR of target + # ignore the edge case of the first two TRs + features = np.concatenate( + (features, np.roll(features, 1, axis=0), np.roll(features, 2, axis=0)), axis=1 + ) + features = features[2:, :] + target = target[2:] + elif dynamic_pred == "past_and_future": + # concat current TR and two TR before and after of features to predict the current TR of target + # ignore the edge case of the first and last two TRs + features = np.concatenate( + ( + features, + np.roll(features, 1, axis=0), + np.roll(features, 2, axis=0), + np.roll(features, -1, axis=0), + np.roll(features, -2, axis=0), + ), + axis=1, + ) + features = features[2:-2, :] + target = target[2:-2] + + return features, target + + +def dFC_feature_extraction( + task, + train_subjects, + test_subjects, + dFC_id, + roi_root, + dFC_root, + dynamic_pred="no", + normalize_dFC=True, +): + """ + Extract features and target for task presence classification + for all subjects. 
+ """ + X_train = None + y_train = None + subj_label_train = list() + for subj in train_subjects: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() - # find num_PCs - pca = PCA(svd_solver="full", whiten=False) - pca.fit(X_train) - num_PCs = np.where(np.cumsum(pca.explained_variance_ratio_) > 0.95)[0][0] + 1 + task_data = np.load( + f"{roi_root}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" + ).item() - # create a pipeline with a knn model to find the best n_neighbors - knn = make_pipeline( - StandardScaler(), - PCA(n_components=num_PCs), - KNeighborsClassifier(), + X_subj, y_subj = dFC_feature_extraction_subj_lvl( + dFC=dFC, + task_data=task_data, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, ) - # create a dictionary of all values we want to test for n_neighbors - param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)} - # use gridsearch to test all values for n_neighbors - knn_gscv = GridSearchCV(knn, param_grid, cv=5) - # fit model to data - knn_gscv.fit(X_train, y_condition_train) - - n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"] - - neigh = make_pipeline( - StandardScaler(), - PCA(n_components=num_PCs), - KNeighborsClassifier(n_neighbors=n_neighbors), - ).fit(X_train, y_condition_train) - - ML_RESULT[task] = { - "pca": pca, - "num_PCs": num_PCs, - "cv_results": knn_gscv.cv_results_, - "KNN": neigh, - "KNN train score": neigh.score(X_train, y_condition_train), - "KNN test score": neigh.score(X_test, y_condition_test), - } - - print( - f"KNN train score {dFC.measure.measure_name} {task}: {neigh.score(X_train, y_condition_train)}" + + subj_label_train.extend([subj for i in range(X_subj.shape[0])]) + if X_train is None and y_train is None: + X_train = X_subj + y_train = y_subj + else: + X_train = np.concatenate((X_train, X_subj), axis=0) + y_train = np.concatenate((y_train, y_subj), axis=0) + + X_test = None + y_test = None + subj_label_test = list() + for subj 
in test_subjects: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + + task_data = np.load( + f"{roi_root}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" + ).item() + + X_subj, y_subj = dFC_feature_extraction_subj_lvl( + dFC=dFC, + task_data=task_data, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, ) - print( - f"KNN test score {dFC.measure.measure_name} {task}: {neigh.score(X_test, y_condition_test)}" + + subj_label_test.extend([subj for i in range(X_subj.shape[0])]) + if X_test is None and y_test is None: + X_test = X_subj + y_test = y_subj + else: + X_test = np.concatenate((X_test, X_subj), axis=0) + y_test = np.concatenate((y_test, y_subj), axis=0) + + print(X_train.shape, X_test.shape, y_train.shape, y_test.shape) + subj_label_train = np.array(subj_label_train) + subj_label_test = np.array(subj_label_test) + + return ( + X_train, + X_test, + y_train, + y_test, + subj_label_train, + subj_label_test, + dFC.measure.measure_name, + ) + + +def task_presence_classification( + task, + dFC_id, + roi_root, + dFC_root, + dynamic_pred="no", + normalize_dFC=True, + train_test_ratio=0.8, + explained_var_threshold=0.95, +): + print(f"=============== {task} ===============") + + if task == "task-restingstate": + return + + SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id) + + # randomly select train_test_ratio of the subjects for training + # and rest for testing using numpy.random.choice + train_subjects = np.random.choice( + SUBJECTS, int(train_test_ratio * len(SUBJECTS)), replace=False + ) + test_subjects = np.setdiff1d(SUBJECTS, train_subjects) + print( + f"Number of train subjects: {len(train_subjects)} and test subjects: {len(test_subjects)}" + ) + + X_train, X_test, y_train, y_test, subj_label_train, subj_label_test, measure_name = ( + dFC_feature_extraction( + task=task, + train_subjects=train_subjects, + test_subjects=test_subjects, + dFC_id=dFC_id, + roi_root=roi_root, + 
dFC_root=dFC_root, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, ) + ) + + # task presence classification + + print("task presence classification ...") + + # find num_PCs + pca = PCA(svd_solver="full", whiten=False) + pca.fit(X_train) + num_PCs = ( + np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] + + 1 + ) + + # create a pipeline with a knn model to find the best n_neighbors + knn = make_pipeline( + StandardScaler(), + PCA(n_components=num_PCs), + KNeighborsClassifier(), + ) + # create a dictionary of all values we want to test for n_neighbors + param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)} + # use gridsearch to test all values for n_neighbors + knn_gscv = GridSearchCV(knn, param_grid, cv=5) + # fit model to data + knn_gscv.fit(X_train, y_train) + + n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"] + + neigh = make_pipeline( + StandardScaler(), + PCA(n_components=num_PCs), + KNeighborsClassifier(n_neighbors=n_neighbors), + ).fit(X_train, y_train) + + ML_RESULT = { + "pca": pca, + "num_PCs": num_PCs, + "cv_results": knn_gscv.cv_results_, + "KNN": neigh, + "KNN train score": neigh.score(X_train, y_train), + "KNN test score": neigh.score(X_test, y_test), + } + + print(f"KNN train score {measure_name} {task}: {neigh.score(X_train, y_train)}") + print(f"KNN test score {measure_name} {task}: {neigh.score(X_test, y_test)}") + + # measure pred score on each subj + + ML_scores = { + "subj_id": list(), + "group": list(), + "task": list(), + "dFC method": list(), + "KNN accuracy": list(), + } + for subj in SUBJECTS: + ML_scores["subj_id"].append(subj) + if subj in train_subjects: + ML_scores["group"].append("train") + features = X_train[subj_label_train == subj, :] + target = y_train[subj_label_train == subj] + elif subj in test_subjects: + ML_scores["group"].append("test") + features = X_test[subj_label_test == subj, :] + target = y_test[subj_label_test == subj] + + pred = 
neigh.predict(features) + + ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred)) + + ML_scores["task"].append(task) + ML_scores["dFC method"].append(measure_name) + + return ML_RESULT, ML_scores + + +def run_classification( + TASKS, + roi_root, + dFC_root, + output_root, + dynamic_pred="no", + normalize_dFC=True, +): + ML_scores = { + "subj_id": list(), + "group": list(), + "task": list(), + "dFC method": list(), + "KNN accuracy": list(), + } + for dFC_id in range(0, 7): + print(f"=================== dFC {dFC_id} ===================") + + ML_RESULT = {} + for task_id, task in enumerate(TASKS): + ML_RESULT_new, ML_scores_new = task_presence_classification( + task=task, + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, + ) + ML_RESULT[task] = ML_RESULT_new + for key in ML_scores: + ML_scores[key].extend(ML_scores_new[key]) - # measure pred score on each subj + folder = f"{output_root}" + if not os.path.exists(folder): + os.makedirs(folder) + np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT) - for subj in SUBJECTS: - ML_scores["subj_id"].append(subj) - if subj in train_subjects: - ML_scores["group"].append("train") - features = X_train[subj_label_train == subj, :] - target = y_condition_train[subj_label_train == subj] - elif subj in test_subjects: - ML_scores["group"].append("test") - features = X_test[subj_label_test == subj, :] - target = y_condition_test[subj_label_test == subj] + np.save(f"{folder}/ML_scores_KNN_classify.npy", ML_scores) - pred = neigh.predict(features) - ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred)) +####################################################################################### - ML_scores["task"].append(task) - ML_scores["dFC method"].append(dFC.measure.measure_name) +if __name__ == "__main__": + # argparse + HELPTEXT = """ + Script to apply Machine Learning on dFC results to predict task presence. 
+ """ - folder = f"{output_root}" - if not os.path.exists(folder): - os.makedirs(folder) - np.save(f"{folder}/ML_RESULT_{dFC.measure.measure_name}.npy", ML_RESULT) + parser = argparse.ArgumentParser(description=HELPTEXT) -np.save(f"{folder}/task_features_KNN_classify.npy", task_features) -np.save(f"{folder}/ML_scores_KNN_classify.npy", ML_scores) + parser.add_argument("--dataset_info", type=str, help="path to dataset info file") + + args = parser.parse_args() + + dataset_info_file = args.dataset_info + + # Read global configs + with open(dataset_info_file, "r") as f: + dataset_info = json.load(f) + + print("Task presence prediction started ...") + + TASKS = dataset_info["TASKS"] + + if "{dataset}" in dataset_info["main_root"]: + main_root = dataset_info["main_root"].replace( + "{dataset}", dataset_info["dataset"] + ) + else: + main_root = dataset_info["main_root"] + + if "{main_root}" in dataset_info["roi_root"]: + roi_root = dataset_info["roi_root"].replace("{main_root}", main_root) + else: + roi_root = dataset_info["roi_root"] + + if "{main_root}" in dataset_info["dFC_root"]: + dFC_root = dataset_info["dFC_root"].replace("{main_root}", main_root) + else: + dFC_root = dataset_info["dFC_root"] + + if "{main_root}" in dataset_info["ML_root"]: + ML_root = dataset_info["ML_root"].replace("{main_root}", main_root) + else: + ML_root = dataset_info["ML_root"] + + extract_task_features( + TASKS=TASKS, + roi_root=roi_root, + output_root=ML_root, + ) + run_classification( + TASKS=TASKS, + roi_root=roi_root, + dFC_root=dFC_root, + output_root=ML_root, + dynamic_pred="no", + normalize_dFC=True, + ) + + print("Task presence prediction CODE finished running.") + +####################################################################################### diff --git a/task_dFC/ML.py b/task_dFC/ML.py new file mode 100644 index 0000000..2e19811 --- /dev/null +++ b/task_dFC/ML.py @@ -0,0 +1,461 @@ +import argparse +import json +import os + +import numpy as np +from sklearn.decomposition 
import PCA +from sklearn.metrics import balanced_accuracy_score +from sklearn.model_selection import GridSearchCV +from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler + +from pydfc import DFC, data_loader, task_utils +from pydfc.dfc_utils import dFC_mat2vec, rank_norm + +####################################################################################### + + +def find_available_subjects(dFC_root, task, dFC_id=None): + """ + Find the subjects that have dFC results for the given task and dFC_id (method). + """ + SUBJECTS = list() + ALL_SUBJ_FOLDERS = os.listdir(f"{dFC_root}/") + ALL_SUBJ_FOLDERS = [folder for folder in ALL_SUBJ_FOLDERS if "sub-" in folder] + ALL_SUBJ_FOLDERS.sort() + for subj_folder in ALL_SUBJ_FOLDERS: + ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/") + ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if task in dFC_file] + if dFC_id is not None: + ALL_DFC_FILES = [ + dFC_file for dFC_file in ALL_DFC_FILES if f"_{dFC_id}.npy" in dFC_file + ] + ALL_DFC_FILES.sort() + if len(ALL_DFC_FILES) > 0: + SUBJECTS.append(subj_folder) + return SUBJECTS + + +def extract_task_features(TASKS, roi_root, output_root): + """ + Extract task features from the event data.""" + task_features = { + "task": list(), + "relative_task_on": list(), + "avg_task_duration": list(), + "var_task_duration": list(), + "avg_rest_duration": list(), + "var_rest_duration": list(), + "num_of_transitions": list(), + "relative_transition_freq": list(), + } + for task_id, task in enumerate(TASKS): + + if task == "task-restingstate": + continue + + SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task) + + for subj in SUBJECTS: + # event data + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" + ).item() + Fs_task = task_data["Fs_task"] + TR_task = 1 / Fs_task + + task_presence = task_utils.extract_task_presence( + 
event_labels=task_data["event_labels"], + TR_task=TR_task, + TR_mri=task_data["TR_mri"], + binary=True, + ) + + relative_task_on = task_utils.relative_task_on(task_presence) + # task duration + avg_task_duration, var_task_duration = task_utils.task_duration( + task_presence, task_data["TR_mri"] + ) + # rest duration + avg_rest_duration, var_rest_duration = task_utils.rest_duration( + task_presence, task_data["TR_mri"] + ) + # freq of transitions + num_of_transitions, relative_transition_freq = task_utils.transition_freq( + task_presence + ) + + task_features["task"].append(task) + task_features["relative_task_on"].append(relative_task_on) + task_features["avg_task_duration"].append(avg_task_duration) + task_features["var_task_duration"].append(var_task_duration) + task_features["avg_rest_duration"].append(avg_rest_duration) + task_features["var_rest_duration"].append(var_rest_duration) + task_features["num_of_transitions"].append(num_of_transitions) + task_features["relative_transition_freq"].append(relative_transition_freq) + + folder = f"{output_root}" + if not os.path.exists(folder): + os.makedirs(folder) + np.save(f"{folder}/task_features.npy", task_features) + + +def dFC_feature_extraction_subj_lvl( + dFC, + task_data, + dynamic_pred="no", + normalize_dFC=True, +): + """ + Extract features and target for task presence classification + for a single subject. 
+ dynamic_pred: "no", "past", "past_and_future" + """ + # dFC features + dFC_mat = dFC.get_dFC_mat() + TR_array = dFC.TR_array + if normalize_dFC: + dFC_mat = rank_norm(dFC_mat) + dFC_vecs = dFC_mat2vec(dFC_mat) + + # event data + task_presence = task_utils.extract_task_presence( + event_labels=task_data["event_labels"], + TR_task=1 / task_data["Fs_task"], + TR_mri=task_data["TR_mri"], + TR_array=TR_array, + binary=True, + ) + + features = dFC_vecs + target = task_presence.ravel() + + if dynamic_pred == "past": + # concat current TR and two TR before of features to predict the current TR of target + # ignore the edge case of the first two TRs + features = np.concatenate( + (features, np.roll(features, 1, axis=0), np.roll(features, 2, axis=0)), axis=1 + ) + features = features[2:, :] + target = target[2:] + elif dynamic_pred == "past_and_future": + # concat current TR and two TR before and after of features to predict the current TR of target + # ignore the edge case of the first and last two TRs + features = np.concatenate( + ( + features, + np.roll(features, 1, axis=0), + np.roll(features, 2, axis=0), + np.roll(features, -1, axis=0), + np.roll(features, -2, axis=0), + ), + axis=1, + ) + features = features[2:-2, :] + target = target[2:-2] + + return features, target + + +def dFC_feature_extraction( + task, + train_subjects, + test_subjects, + dFC_id, + roi_root, + dFC_root, + dynamic_pred="no", + normalize_dFC=True, +): + """ + Extract features and target for task presence classification + for all subjects. 
+ """ + X_train = None + y_train = None + subj_label_train = list() + for subj in train_subjects: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" + ).item() + + X_subj, y_subj = dFC_feature_extraction_subj_lvl( + dFC=dFC, + task_data=task_data, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, + ) + + subj_label_train.extend([subj for i in range(X_subj.shape[0])]) + if X_train is None and y_train is None: + X_train = X_subj + y_train = y_subj + else: + X_train = np.concatenate((X_train, X_subj), axis=0) + y_train = np.concatenate((y_train, y_subj), axis=0) + + X_test = None + y_test = None + subj_label_test = list() + for subj in test_subjects: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" + ).item() + + X_subj, y_subj = dFC_feature_extraction_subj_lvl( + dFC=dFC, + task_data=task_data, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, + ) + + subj_label_test.extend([subj for i in range(X_subj.shape[0])]) + if X_test is None and y_test is None: + X_test = X_subj + y_test = y_subj + else: + X_test = np.concatenate((X_test, X_subj), axis=0) + y_test = np.concatenate((y_test, y_subj), axis=0) + + print(X_train.shape, X_test.shape, y_train.shape, y_test.shape) + subj_label_train = np.array(subj_label_train) + subj_label_test = np.array(subj_label_test) + + return ( + X_train, + X_test, + y_train, + y_test, + subj_label_train, + subj_label_test, + dFC.measure.measure_name, + ) + + +def task_presence_classification( + task, + dFC_id, + roi_root, + dFC_root, + dynamic_pred="no", + normalize_dFC=True, + train_test_ratio=0.8, + explained_var_threshold=0.95, +): + print(f"=============== {task} ===============") + + if task == "task-restingstate": + return + + SUBJECTS 
= find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id) + + # randomly select train_test_ratio of the subjects for training + # and rest for testing using numpy.random.choice + train_subjects = np.random.choice( + SUBJECTS, int(train_test_ratio * len(SUBJECTS)), replace=False + ) + test_subjects = np.setdiff1d(SUBJECTS, train_subjects) + print( + f"Number of train subjects: {len(train_subjects)} and test subjects: {len(test_subjects)}" + ) + + X_train, X_test, y_train, y_test, subj_label_train, subj_label_test, measure_name = ( + dFC_feature_extraction( + task=task, + train_subjects=train_subjects, + test_subjects=test_subjects, + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, + ) + ) + + # task presence classification + + print("task presence classification ...") + + # find num_PCs + pca = PCA(svd_solver="full", whiten=False) + pca.fit(X_train) + num_PCs = ( + np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] + + 1 + ) + + # create a pipeline with a knn model to find the best n_neighbors + knn = make_pipeline( + StandardScaler(), + PCA(n_components=num_PCs), + KNeighborsClassifier(), + ) + # create a dictionary of all values we want to test for n_neighbors + param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)} + # use gridsearch to test all values for n_neighbors + knn_gscv = GridSearchCV(knn, param_grid, cv=5) + # fit model to data + knn_gscv.fit(X_train, y_train) + + n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"] + + neigh = make_pipeline( + StandardScaler(), + PCA(n_components=num_PCs), + KNeighborsClassifier(n_neighbors=n_neighbors), + ).fit(X_train, y_train) + + ML_RESULT = { + "pca": pca, + "num_PCs": num_PCs, + "cv_results": knn_gscv.cv_results_, + "KNN": neigh, + "KNN train score": neigh.score(X_train, y_train), + "KNN test score": neigh.score(X_test, y_test), + } + + print(f"KNN train score 
{measure_name} {task}: {neigh.score(X_train, y_train)}") + print(f"KNN test score {measure_name} {task}: {neigh.score(X_test, y_test)}") + + # measure pred score on each subj + + ML_scores = { + "subj_id": list(), + "group": list(), + "task": list(), + "dFC method": list(), + "KNN accuracy": list(), + } + for subj in SUBJECTS: + ML_scores["subj_id"].append(subj) + if subj in train_subjects: + ML_scores["group"].append("train") + features = X_train[subj_label_train == subj, :] + target = y_train[subj_label_train == subj] + elif subj in test_subjects: + ML_scores["group"].append("test") + features = X_test[subj_label_test == subj, :] + target = y_test[subj_label_test == subj] + + pred = neigh.predict(features) + + ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred)) + + ML_scores["task"].append(task) + ML_scores["dFC method"].append(measure_name) + + return ML_RESULT, ML_scores + + +def run_classification( + TASKS, + roi_root, + dFC_root, + output_root, + dynamic_pred="no", + normalize_dFC=True, +): + ML_scores = { + "subj_id": list(), + "group": list(), + "task": list(), + "dFC method": list(), + "KNN accuracy": list(), + } + for dFC_id in range(0, 7): + print(f"=================== dFC {dFC_id} ===================") + + ML_RESULT = {} + for task_id, task in enumerate(TASKS): + ML_RESULT_new, ML_scores_new = task_presence_classification( + task=task, + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, + ) + ML_RESULT[task] = ML_RESULT_new + for key in ML_scores: + ML_scores[key].extend(ML_scores_new[key]) + + folder = f"{output_root}" + if not os.path.exists(folder): + os.makedirs(folder) + np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT) + + np.save(f"{folder}/ML_scores_classify.npy", ML_scores) + + +####################################################################################### + +if __name__ == "__main__": + # argparse + HELPTEXT = """ + Script to apply Machine 
Learning on dFC results to predict task presence. + """ + + parser = argparse.ArgumentParser(description=HELPTEXT) + + parser.add_argument("--dataset_info", type=str, help="path to dataset info file") + + args = parser.parse_args() + + dataset_info_file = args.dataset_info + + # Read global configs + with open(dataset_info_file, "r") as f: + dataset_info = json.load(f) + + print("Task presence prediction started ...") + + TASKS = dataset_info["TASKS"] + + if "{dataset}" in dataset_info["main_root"]: + main_root = dataset_info["main_root"].replace( + "{dataset}", dataset_info["dataset"] + ) + else: + main_root = dataset_info["main_root"] + + if "{main_root}" in dataset_info["roi_root"]: + roi_root = dataset_info["roi_root"].replace("{main_root}", main_root) + else: + roi_root = dataset_info["roi_root"] + + if "{main_root}" in dataset_info["dFC_root"]: + dFC_root = dataset_info["dFC_root"].replace("{main_root}", main_root) + else: + dFC_root = dataset_info["dFC_root"] + + if "{main_root}" in dataset_info["ML_root"]: + ML_root = dataset_info["ML_root"].replace("{main_root}", main_root) + else: + ML_root = dataset_info["ML_root"] + + extract_task_features( + TASKS=TASKS, + roi_root=roi_root, + output_root=ML_root, + ) + run_classification( + TASKS=TASKS, + roi_root=roi_root, + dFC_root=dFC_root, + output_root=ML_root, + dynamic_pred="no", + normalize_dFC=True, + ) + + print("Task presence prediction CODE finished running.") + +####################################################################################### From bf1e25b96950d09b0133b0138e133269e381c499 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 29 May 2024 15:18:45 -0400 Subject: [PATCH 020/401] add run to ML.py --- task_dFC/ML.py | 106 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 78 insertions(+), 28 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 2e19811..9f14283 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -16,7 +16,7 @@ 
####################################################################################### -def find_available_subjects(dFC_root, task, dFC_id=None): +def find_available_subjects(dFC_root, task, run=None, dFC_id=None): """ Find the subjects that have dFC results for the given task and dFC_id (method). """ @@ -31,6 +31,8 @@ def find_available_subjects(dFC_root, task, dFC_id=None): ALL_DFC_FILES = [ dFC_file for dFC_file in ALL_DFC_FILES if f"_{dFC_id}.npy" in dFC_file ] + if run is not None: + ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if run in dFC_file] ALL_DFC_FILES.sort() if len(ALL_DFC_FILES) > 0: SUBJECTS.append(subj_folder) @@ -165,24 +167,34 @@ def dFC_feature_extraction( dFC_id, roi_root, dFC_root, + run=None, dynamic_pred="no", normalize_dFC=True, ): """ Extract features and target for task presence classification for all subjects. + if run is specified, dFC results for that run will be used. """ X_train = None y_train = None subj_label_train = list() for subj in train_subjects: - dFC = np.load( - f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" - ).item() - - task_data = np.load( - f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" - ).item() + if run is None: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" + ).item() + else: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy", + allow_pickle="TRUE", + ).item() X_subj, y_subj = dFC_feature_extraction_subj_lvl( dFC=dFC, @@ -203,13 +215,21 @@ def dFC_feature_extraction( y_test = None subj_label_test = list() for subj in test_subjects: - dFC = np.load( - f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" - ).item() - - task_data = np.load( - 
f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" - ).item() + if run is None: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" + ).item() + else: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy", + allow_pickle="TRUE", + ).item() X_subj, y_subj = dFC_feature_extraction_subj_lvl( dFC=dFC, @@ -246,17 +266,26 @@ def task_presence_classification( dFC_id, roi_root, dFC_root, + run=None, dynamic_pred="no", normalize_dFC=True, train_test_ratio=0.8, explained_var_threshold=0.95, ): - print(f"=============== {task} ===============") + """ + perform task presence classification using KNN for a given task and dFC method and run. + """ + if run is None: + print(f"=============== {task} ===============") + else: + print(f"=============== {task} {run} ===============") if task == "task-restingstate": return - SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id) + SUBJECTS = find_available_subjects( + dFC_root=dFC_root, task=task, run=run, dFC_id=dFC_id + ) # randomly select train_test_ratio of the subjects for training # and rest for testing using numpy.random.choice @@ -276,6 +305,7 @@ def task_presence_classification( dFC_id=dFC_id, roi_root=roi_root, dFC_root=dFC_root, + run=run, dynamic_pred=dynamic_pred, normalize_dFC=normalize_dFC, ) @@ -332,6 +362,7 @@ def task_presence_classification( "subj_id": list(), "group": list(), "task": list(), + "run": list(), "dFC method": list(), "KNN accuracy": list(), } @@ -351,6 +382,7 @@ def task_presence_classification( ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred)) ML_scores["task"].append(task) + ML_scores["run"].append(run) ML_scores["dFC method"].append(measure_name) return ML_RESULT, 
ML_scores @@ -358,6 +390,7 @@ def task_presence_classification( def run_classification( TASKS, + RUNS, roi_root, dFC_root, output_root, @@ -368,6 +401,7 @@ def run_classification( "subj_id": list(), "group": list(), "task": list(), + "run": list(), "dFC method": list(), "KNN accuracy": list(), } @@ -376,17 +410,25 @@ def run_classification( ML_RESULT = {} for task_id, task in enumerate(TASKS): - ML_RESULT_new, ML_scores_new = task_presence_classification( - task=task, - dFC_id=dFC_id, - roi_root=roi_root, - dFC_root=dFC_root, - dynamic_pred=dynamic_pred, - normalize_dFC=normalize_dFC, - ) - ML_RESULT[task] = ML_RESULT_new - for key in ML_scores: - ML_scores[key].extend(ML_scores_new[key]) + if RUNS is None: + RUNS = {task: [None]} + ML_RESULT[task] = {} + for run in RUNS[task]: + ML_RESULT_new, ML_scores_new = task_presence_classification( + task=task, + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + run=run, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, + ) + if run is None: + ML_RESULT[task] = ML_RESULT_new + else: + ML_RESULT[task][run] = ML_RESULT_new + for key in ML_scores: + ML_scores[key].extend(ML_scores_new[key]) folder = f"{output_root}" if not os.path.exists(folder): @@ -419,6 +461,13 @@ def run_classification( print("Task presence prediction started ...") TASKS = dataset_info["TASKS"] + if "RUNS" in dataset_info: + if dataset_info["RUNS"] is not None: + RUNS = dataset_info["RUNS"] + else: + RUNS = None + else: + RUNS = None if "{dataset}" in dataset_info["main_root"]: main_root = dataset_info["main_root"].replace( @@ -449,6 +498,7 @@ def run_classification( ) run_classification( TASKS=TASKS, + RUNS=RUNS, roi_root=roi_root, dFC_root=dFC_root, output_root=ML_root, From 3043c43258c0e9b3d6c4d6688d0cb6d4009248a7 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 29 May 2024 16:13:47 -0400 Subject: [PATCH 021/401] minor fix --- task_dFC/ML.py | 92 +++++++++++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 
39 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 9f14283..0e5b626 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -39,11 +39,12 @@ def find_available_subjects(dFC_root, task, run=None, dFC_id=None): return SUBJECTS -def extract_task_features(TASKS, roi_root, output_root): +def extract_task_features(TASKS, RUNS, roi_root, output_root): """ Extract task features from the event data.""" task_features = { "task": list(), + "run": list(), "relative_task_on": list(), "avg_task_duration": list(), "var_task_duration": list(), @@ -57,45 +58,57 @@ def extract_task_features(TASKS, roi_root, output_root): if task == "task-restingstate": continue - SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task) + if RUNS is None: + RUNS = {task: [None]} + for run in RUNS[task]: - for subj in SUBJECTS: - # event data - task_data = np.load( - f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" - ).item() - Fs_task = task_data["Fs_task"] - TR_task = 1 / Fs_task - - task_presence = task_utils.extract_task_presence( - event_labels=task_data["event_labels"], - TR_task=TR_task, - TR_mri=task_data["TR_mri"], - binary=True, - ) - - relative_task_on = task_utils.relative_task_on(task_presence) - # task duration - avg_task_duration, var_task_duration = task_utils.task_duration( - task_presence, task_data["TR_mri"] - ) - # rest duration - avg_rest_duration, var_rest_duration = task_utils.rest_duration( - task_presence, task_data["TR_mri"] - ) - # freq of transitions - num_of_transitions, relative_transition_freq = task_utils.transition_freq( - task_presence - ) - - task_features["task"].append(task) - task_features["relative_task_on"].append(relative_task_on) - task_features["avg_task_duration"].append(avg_task_duration) - task_features["var_task_duration"].append(var_task_duration) - task_features["avg_rest_duration"].append(avg_rest_duration) - task_features["var_rest_duration"].append(var_rest_duration) - 
task_features["num_of_transitions"].append(num_of_transitions) - task_features["relative_transition_freq"].append(relative_transition_freq) + SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task, run=run) + + for subj in SUBJECTS: + # event data + if run is None: + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", + allow_pickle="TRUE", + ).item() + else: + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy", + allow_pickle="TRUE", + ).item() + Fs_task = task_data["Fs_task"] + TR_task = 1 / Fs_task + + task_presence = task_utils.extract_task_presence( + event_labels=task_data["event_labels"], + TR_task=TR_task, + TR_mri=task_data["TR_mri"], + binary=True, + ) + + relative_task_on = task_utils.relative_task_on(task_presence) + # task duration + avg_task_duration, var_task_duration = task_utils.task_duration( + task_presence, task_data["TR_mri"] + ) + # rest duration + avg_rest_duration, var_rest_duration = task_utils.rest_duration( + task_presence, task_data["TR_mri"] + ) + # freq of transitions + num_of_transitions, relative_transition_freq = task_utils.transition_freq( + task_presence + ) + + task_features["task"].append(task) + task_features["run"].append(run) + task_features["relative_task_on"].append(relative_task_on) + task_features["avg_task_duration"].append(avg_task_duration) + task_features["var_task_duration"].append(var_task_duration) + task_features["avg_rest_duration"].append(avg_rest_duration) + task_features["var_rest_duration"].append(var_rest_duration) + task_features["num_of_transitions"].append(num_of_transitions) + task_features["relative_transition_freq"].append(relative_transition_freq) folder = f"{output_root}" if not os.path.exists(folder): @@ -493,6 +506,7 @@ def run_classification( extract_task_features( TASKS=TASKS, + RUNS=RUNS, roi_root=roi_root, output_root=ML_root, ) From 351c5f23749f7806f0df7d4a1c94b738dc486e51 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 29 May 2024 
17:00:33 -0400 Subject: [PATCH 022/401] fix event_types issue --- pydfc/task_utils.py | 15 ++++++++++----- task_dFC/nifti_to_roi_signal.py | 10 ++-------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py index a24b3cf..8598ca3 100644 --- a/pydfc/task_utils.py +++ b/pydfc/task_utils.py @@ -19,10 +19,10 @@ def events_time_to_labels( - events, TR_mri, num_time_mri, event_types=[], oversampling=50, return_0_1=False + events, TR_mri, num_time_mri, event_types=None, oversampling=50, return_0_1=False ): """ - event_types is a list of event types to be considered. If None, 0 and 1s will be returned. + event_types is a list of event types to be considered. If None, it will found based on events. Assigns the longest event in each TR to that TR (in the interval from last TR to current TR). It assumes that the first time point is TR0 which corresponds to [0 sec, TR sec] interval. oversampling: number of samples per TR_mri to improve the time resolution of tasks @@ -43,6 +43,9 @@ def events_time_to_labels( events[0, trial_type_idx] == "trial_type" ), "Something went wrong with the events file! The trial_type column was not found!" 
+ if event_types is None: + event_types = ["rest"] + list(np.unique(events[1:, trial_type_idx])) + Fs = float(1 / TR_mri) * oversampling num_time_task = int(num_time_mri * oversampling) event_labels = np.zeros((num_time_task, 1)) @@ -52,8 +55,10 @@ def events_time_to_labels( continue if events[i, trial_type_idx] in event_types: - if events[i, trial_type_idx] == "rest": - warnings.warn("trial types should not include 'rest'") + if ("rest" in events[i, trial_type_idx]) or ( + "Rest" in events[i, trial_type_idx] + ): + raise ValueError("trial types should not include 'rest'") start_time = float(events[i, onset_idx]) end_time = float(events[i, onset_idx]) + float(events[i, duration_idx]) start_timepoint = int(np.rint(start_time * Fs)) @@ -65,7 +70,7 @@ def events_time_to_labels( if return_0_1: event_labels = np.multiply(event_labels != 0, 1) - return event_labels, Fs + return event_labels, Fs, event_types ################################# Visualization Functions #################################### diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index 7ee8870..507e714 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -109,17 +109,11 @@ def run_roi_signal_extraction( events_file = f"{task_events_root}/{ALL_EVENTS_FILES[0]}" events = np.genfromtxt(events_file, delimiter="\t", dtype=str) # get the event labels - # check that "rest" does not already exist in the event types - if any( - ["rest" in event_type for event_type in list(np.unique(events[1:, 2]))] - ): - raise ValueError("Event types should not include 'rest'") - event_types = ["rest"] + list(np.unique(events[1:, 2])) - event_labels, Fs_task = task_utils.events_time_to_labels( + event_labels, Fs_task, event_types = task_utils.events_time_to_labels( events=events, TR_mri=TR_mri, num_time_mri=num_time_mri, - event_types=event_types, + event_types=None, oversampling=oversampling, return_0_1=False, ) From 4eeb16c3a0a39da81c92a925f6aa4b55ab325725 Mon 
Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 29 May 2024 19:30:49 -0400 Subject: [PATCH 023/401] set binarizing method to median --- pydfc/task_utils.py | 16 ++++++++++++---- task_dFC/ML.py | 2 ++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py index 8598ca3..fad46d9 100644 --- a/pydfc/task_utils.py +++ b/pydfc/task_utils.py @@ -237,7 +237,9 @@ def downsample_events_hrf(events_hrf, TR_mri, TR_task, method="uniform"): return events_hrf_ds -def extract_task_presence(event_labels, TR_task, TR_mri, TR_array=None, binary=True): +def extract_task_presence( + event_labels, TR_task, TR_mri, TR_array=None, binary=True, binarizing_method="median" +): """ event_labels: event labels including 0 and event ids at the time each event happens TR_task: TR of task @@ -247,6 +249,7 @@ def extract_task_presence(event_labels, TR_task, TR_mri, TR_array=None, binary=T This function extracts the task presence from the event labels and returns it in the same time points as the dFC data It also downsamples the task presence to the time points of the dFC data if binary is True, the task presence is binarized using the mean of the task presence + binarizing_method: 'median' or 'mean' """ # event_labels_all_task is all conditions together, rest vs. 
task times @@ -266,9 +269,14 @@ def extract_task_presence(event_labels, TR_task, TR_mri, TR_array=None, binary=T event_labels_all_task_hrf = event_labels_all_task_hrf[:, 1] if binary: - task_presence = np.where( - event_labels_all_task_hrf > np.mean(event_labels_all_task_hrf), 1, 0 - ) + if binarizing_method == "median": + task_presence = np.where( + event_labels_all_task_hrf > np.median(event_labels_all_task_hrf), 1, 0 + ) + elif binarizing_method == "mean": + task_presence = np.where( + event_labels_all_task_hrf > np.mean(event_labels_all_task_hrf), 1, 0 + ) else: task_presence = event_labels_all_task_hrf diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 0e5b626..d70b4f9 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -84,6 +84,7 @@ def extract_task_features(TASKS, RUNS, roi_root, output_root): TR_task=TR_task, TR_mri=task_data["TR_mri"], binary=True, + binarizing_method="median", ) relative_task_on = task_utils.relative_task_on(task_presence) @@ -141,6 +142,7 @@ def dFC_feature_extraction_subj_lvl( TR_mri=task_data["TR_mri"], TR_array=TR_array, binary=True, + binarizing_method="median", ) features = dFC_vecs From c042b44d49d19e9e6ac4690f27d22af96e9f7dcf Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 29 May 2024 21:44:57 -0400 Subject: [PATCH 024/401] change simul tasks --- simul_dFC/task_data_simulator.py | 38 +++++--------------------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py index 7823932..7fe64c3 100644 --- a/simul_dFC/task_data_simulator.py +++ b/simul_dFC/task_data_simulator.py @@ -45,24 +45,11 @@ print(f"subject-level simulation started running ... 
for subject: {subj_id} ...") all_task_info = { - "task-midFreqMidRest": { - "task_name": "task-midFreqMidRest", - "onset_time": onset_time, - "task_duration": 12.0, - "task_block_duration": 30.0, - "sim_length": sim_length, - "BOLD_period": BOLD_period, - "TAVG_period": TAVG_period, - "global_conn_coupling_coef": global_conn_coupling_coef, - "D": D, - "conn_speed": conn_speed, - "dt": dt, - }, "task-lowFreqLongRest": { "task_name": "task-lowFreqLongRest", "onset_time": onset_time, - "task_duration": 20.0, - "task_block_duration": 40.0, + "task_duration": 8.0, + "task_block_duration": 20.0, "sim_length": sim_length, "BOLD_period": BOLD_period, "TAVG_period": TAVG_period, @@ -74,21 +61,8 @@ "task-lowFreqShortRest": { "task_name": "task-lowFreqShortRest", "onset_time": onset_time, - "task_duration": 20.0, - "task_block_duration": 25.0, - "sim_length": sim_length, - "BOLD_period": BOLD_period, - "TAVG_period": TAVG_period, - "global_conn_coupling_coef": global_conn_coupling_coef, - "D": D, - "conn_speed": conn_speed, - "dt": dt, - }, - "task-lowFreqShortTask": { - "task_name": "task-lowFreqShortTask", - "onset_time": onset_time, - "task_duration": 5.0, - "task_block_duration": 30.0, + "task_duration": 12.0, + "task_block_duration": 20.0, "sim_length": sim_length, "BOLD_period": BOLD_period, "TAVG_period": TAVG_period, @@ -123,11 +97,11 @@ "conn_speed": conn_speed, "dt": dt, }, - "task-midFreqMidRestNoisy": { + "task-lowFreqShortRestNoisy": { "task_name": "task-midFreqMidRestNoisy", "onset_time": onset_time, "task_duration": 12.0, - "task_block_duration": 30.0, + "task_block_duration": 20.0, "sim_length": sim_length, "BOLD_period": BOLD_period, "TAVG_period": TAVG_period, From 963fcfdd487639b62d7cd9293c770bebbc371bfa Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 30 May 2024 12:10:04 -0400 Subject: [PATCH 025/401] handle Rest in events --- pydfc/task_utils.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pydfc/task_utils.py 
b/pydfc/task_utils.py index fad46d9..a807da1 100644 --- a/pydfc/task_utils.py +++ b/pydfc/task_utils.py @@ -44,7 +44,15 @@ def events_time_to_labels( ), "Something went wrong with the events file! The trial_type column was not found!" if event_types is None: - event_types = ["rest"] + list(np.unique(events[1:, trial_type_idx])) + event_types = list(np.unique(events[1:, trial_type_idx])) + # if rest is already there, remove it + if "rest" in event_types: + warnings.warn("rest is already in the event types") + event_types.remove("rest") + if "Rest" in event_types: + warnings.warn("Rest is already in the event types") + event_types.remove("Rest") + event_types = ["rest"] + event_types Fs = float(1 / TR_mri) * oversampling num_time_task = int(num_time_mri * oversampling) @@ -58,7 +66,7 @@ def events_time_to_labels( if ("rest" in events[i, trial_type_idx]) or ( "Rest" in events[i, trial_type_idx] ): - raise ValueError("trial types should not include 'rest'") + continue start_time = float(events[i, onset_idx]) end_time = float(events[i, onset_idx]) + float(events[i, duration_idx]) start_timepoint = int(np.rint(start_time * Fs)) From f75da420401cc389a0037f6aeb3057410a27c499 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 30 May 2024 13:25:07 -0400 Subject: [PATCH 026/401] add ses --- task_dFC/nifti_to_roi_signal.py | 78 ++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index 507e714..5c19561 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -12,16 +12,26 @@ ################################# FUNCTIONS ################################# def run_roi_signal_extraction( - subj, task, main_root, fmriprep_root, bold_suffix, output_root + subj, + task, + main_root, + fmriprep_root, + bold_suffix, + output_root, + session="", ): """ Extract ROI signals and task labels for a given subject and task + and optionally session. 
""" # find the func file for this subject and task try: - ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/") + if session == "": + ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/") + else: + ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/{session}/func/") except FileNotFoundError: - print(f"Subject {subj} not found in {fmriprep_root}") + print(f"Subject {subj} {session} not found in {fmriprep_root}") return ALL_TASK_FILES = [ @@ -32,7 +42,7 @@ def run_roi_signal_extraction( if not len(ALL_TASK_FILES) >= 1: # if the func file is not found, exclude the subject - print(f"Func file not found for {subj} {task}") + print(f"Func file not found for {subj} {session} {task}") return # there might be multiple runs for the same task @@ -58,11 +68,13 @@ def run_roi_signal_extraction( for run in RUNS: task_file = [file_i for file_i in ALL_TASK_FILES if run in file_i][0] - nifti_file = f"{fmriprep_root}/{subj}/func/{task_file}" - info_file = ( - f"{main_root}/bids/{subj}/func/{task_file.replace(bold_suffix, '_bold.json')}" - ) - + if session == "": + nifti_file = f"{fmriprep_root}/{subj}/func/{task_file}" + task_events_root = f"{main_root}/bids/{subj}/func" + else: + nifti_file = f"{fmriprep_root}/{subj}/{session}/func/{task_file}" + task_events_root = f"{main_root}/bids/{subj}/{session}/func" + info_file = f"{task_events_root}/{task_file.replace(bold_suffix, '_bold.json')}" ################################# LOAD JSON INFO ######################### # Opening JSON file as a dictionary f = open(info_file) @@ -91,7 +103,6 @@ def run_roi_signal_extraction( task_labels = np.zeros((int(num_time_mri * oversampling), 1)) Fs_task = float(1 / TR_mri) * oversampling else: - task_events_root = f"{main_root}/bids/{subj}/func" ALL_EVENTS_FILES = os.listdir(task_events_root) ALL_EVENTS_FILES = [ file_i @@ -99,11 +110,12 @@ def run_roi_signal_extraction( if (subj in file_i) and (task in file_i) and (run in file_i) + and (session in file_i) and ("events.tsv" in file_i) 
] if not len(ALL_EVENTS_FILES) == 1: # if the events file is not found, exclude the subject - print(f"Events file not found for {subj} {task} {run}") + print(f"Events file not found for {subj} {session} {task} {run}") return # load the tsv events file events_file = f"{task_events_root}/{ALL_EVENTS_FILES[0]}" @@ -134,14 +146,23 @@ def run_roi_signal_extraction( "TR_mri": TR_mri, "num_time_mri": num_time_mri, } + + if session == "": + subj_session_prefix = f"{subj}" + output_dir = f"{output_root}/{subj}" + else: + subj_session_prefix = f"{subj}_{session}" + output_dir = f"{output_root}/{subj}/{session}" + if multi_run_flag: - output_file_prefix = f"{subj}_{task}_{run}" + output_file_prefix = f"{subj_session_prefix}_{task}_{run}" else: - output_file_prefix = f"{subj}_{task}" - if not os.path.exists(f"{output_root}/{subj}/"): - os.makedirs(f"{output_root}/{subj}/") - np.save(f"{output_root}/{subj}/{output_file_prefix}_time-series.npy", time_series) - np.save(f"{output_root}/{subj}/{output_file_prefix}_task-data.npy", task_data) + output_file_prefix = f"{subj_session_prefix}_{task}" + + if not os.path.exists(f"{output_dir}/"): + os.makedirs(f"{output_dir}/") + np.save(f"{output_dir}/{output_file_prefix}_time-series.npy", time_series) + np.save(f"{output_dir}/{output_file_prefix}_task-data.npy", task_data) ######################################################################################## @@ -171,6 +192,9 @@ def run_roi_signal_extraction( ) TASKS = dataset_info["TASKS"] + SESSIONS = dataset_info["SESSIONS"] + if SESSIONS is None: + SESSIONS = [""] if "{dataset}" in dataset_info["main_root"]: main_root = dataset_info["main_root"].replace( @@ -189,15 +213,17 @@ def run_roi_signal_extraction( else: output_root = dataset_info["roi_root"] - for task in TASKS: - run_roi_signal_extraction( - subj=participant_id, - task=task, - main_root=main_root, - fmriprep_root=fmriprep_root, - bold_suffix=dataset_info["bold_suffix"], - output_root=output_root, - ) + for session in 
SESSIONS: + for task in TASKS: + run_roi_signal_extraction( + subj=participant_id, + task=task, + main_root=main_root, + fmriprep_root=fmriprep_root, + bold_suffix=dataset_info["bold_suffix"], + output_root=output_root, + session=session, + ) print( f"subject-level ROI signal extraction CODE finished running ... for subject: {participant_id} ..." From 8d13528853b5612775a5869fe46c59a0cd393c96 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 30 May 2024 18:09:04 -0400 Subject: [PATCH 027/401] change FCS estimate --- pydfc/data_loader.py | 68 +++++----- task_dFC/FCS_estimate.py | 229 ++++++++++++++++++-------------- task_dFC/dFC_assessment.py | 2 +- task_dFC/nifti_to_roi_signal.py | 2 +- 4 files changed, 162 insertions(+), 139 deletions(-) diff --git a/pydfc/data_loader.py b/pydfc/data_loader.py index e14b2fc..fba1ced 100644 --- a/pydfc/data_loader.py +++ b/pydfc/data_loader.py @@ -322,25 +322,18 @@ def multi_nifti2timeseries( def load_TS( data_root, file_name, - SESSIONs, subj_id2load=None, task=None, + session=None, run=None, ): """ load a TIME_SERIES object from a .npy file - if SESSIONs is a list, it will load all the sessions, - if it is a string, it will load that session if subj_id2load is None, it will load all the subjects file_name: name of the file to load - format example: {subj_id}_{task}_{run}_time-series.npy + format example: {subj_id}_{session}_{task}_{run}_time-series.npy (keep the {} for the variables) """ - # check if SESSIONs is a list or a string - flag = False - if type(SESSIONs) is str: - SESSIONs = [SESSIONs] - flag = True if subj_id2load is None: SUBJECTS = find_subj_list(data_root) @@ -348,37 +341,42 @@ def load_TS( assert "sub-" in subj_id2load, "subj_id2load must start with 'sub-'" SUBJECTS = [subj_id2load] - TS = {} - for session in SESSIONs: - TS[session] = None - for subj in SUBJECTS: - subj_fldr = subj - # make the file_name - TS_file = deepcopy(file_name) - if "{subj_id}" in file_name: - TS_file = TS_file.replace("{subj_id}", subj) 
- if "{task}" in file_name: - assert task is not None, "task must be provided" - TS_file = TS_file.replace("{task}", task) - if "{run}" in file_name: - assert run is not None, "run must be provided" - TS_file = TS_file.replace("{run}", run) - - try: + TS = None + for subj in SUBJECTS: + subj_fldr = subj + # make the file_name + TS_file = deepcopy(file_name) + if "{subj_id}" in file_name: + TS_file = TS_file.replace("{subj_id}", subj) + if "{task}" in file_name: + assert task is not None, "task must be provided" + TS_file = TS_file.replace("{task}", task) + if "{session}" in file_name: + assert session is not None, "session must be provided" + TS_file = TS_file.replace("{session}", session) + if "{run}" in file_name: + assert run is not None, "run must be provided" + TS_file = TS_file.replace("{run}", run) + + try: + if session is None: time_series = np.load( f"{data_root}/{subj_fldr}/{TS_file}", allow_pickle="True" ).item() - except FileNotFoundError: - print(f"File {TS_file} not found for {subj}") - continue - - if TS[session] is None: - TS[session] = time_series else: - TS[session].concat_ts(time_series) + time_series = np.load( + f"{data_root}/{subj_fldr}/{session}/{TS_file}", + allow_pickle="True", + ).item() + except FileNotFoundError: + print(f"File {TS_file} not found for {subj}") + continue + + if TS is None: + TS = time_series + else: + TS.concat_ts(time_series) - if flag: - return TS[SESSIONs[0]] return TS diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py index d171085..027177a 100644 --- a/task_dFC/FCS_estimate.py +++ b/task_dFC/FCS_estimate.py @@ -1,3 +1,5 @@ +import argparse +import json import os import time import warnings @@ -12,121 +14,54 @@ os.environ["NUMEXPR_NUM_THREADS"] = "16" os.environ["OMP_NUM_THREADS"] = "16" -################################# Parameters ################################# -# data paths -dataset = "ds003242" -# main_root = f"../../DATA/{dataset}" # for local -main_root = 
f"/data/origami/dFC/DATA/task-based/openneuro/{dataset}" # for server -roi_root = f"{main_root}/derivatives/ROI_timeseries" -output_root = f"{main_root}/derivatives/fitted_MEASURES" - -# for consistency we use 0 for resting state -TASKS = ["task-CIC", "task-midloc"] - -# default RUNS = None -RUNS = None -RUNS = { - "task-CIC": ["run-001", "run-002", "run-003", "run-004", "run-005", "run-006"], - "task-midloc": ["run-001"], -} - -job_id = int(os.getenv("SGE_TASK_ID")) -TASK_id = job_id - 1 # SGE_TASK_ID starts from 1 not 0 -if TASK_id >= len(TASKS): - print("TASK_id out of TASKS") - exit() -task = TASKS[TASK_id] - -###### MEASUREMENT PARAMETERS ###### - -# W is in sec - -params_methods = { - # Sliding Parameters - "W": 12, - "n_overlap": 1.0, - "sw_method": "pear_corr", - "tapered_window": True, - # TIME_FREQ - "TF_method": "WTC", - # CLUSTERING AND DHMM - "clstr_base_measure": "SlidingWindow", - # HMM - "hmm_iter": 20, - "dhmm_obs_state_ratio": 16 / 24, - # State Parameters - "n_states": 5, - "n_subj_clstrs": 10, - # Parallelization Parameters - "n_jobs": 2, - "verbose": 0, - "backend": "loky", - # SESSION - "session": task, - # Hyper Parameters - "normalization": True, - "num_subj": None, - "num_time_point": None, -} - -###### HYPER PARAMETERS ALTERNATIVE ###### - -MEASURES_name_lst = [ - "SlidingWindow", - "Time-Freq", - "CAP", - "ContinuousHMM", - "Windowless", - "Clustering", - "DiscreteHMM", -] - -alter_hparams = { - # 'session': ['Rest1_RL', 'Rest2_LR', 'Rest2_RL'], - # 'n_overlap': [0, 0.25, 0.75, 1], - # 'n_states': [6, 16], - # # 'normalization': [], - # 'num_subj': [50, 100, 200], - # 'num_select_nodes': [30, 50, 333], - # 'num_time_point': [800, 1000], - # 'Fs_ratio': [0.50, 0.75, 1.5], - # 'noise_ratio': [1.00, 2.00, 3.00], - # 'num_realization': [] -} - -###### MultiAnalysis PARAMETERS ###### - -params_multi_analysis = { - # Parallelization Parameters - "n_jobs": None, - "verbose": 0, - "backend": "loky", -} - -if RUNS is None: - RUNS = {task: [None]} 
-for run in RUNS[task]: +######################################################################################## + + +def run_FCS_estimate( + params_methods, + MEASURES_name_lst, + alter_hparams, + params_multi_analysis, + task, + roi_root, + output_root, + session=None, + run=None, +): + if session is None: + output_dir = f"{output_root}" + else: + output_dir = f"{output_root}/{session}" + if run is None: print(f"TASK: {task} started ...") - file_suffix = f"{task}" - BOLD_file_name = "{subj_id}_{task}_time-series.npy" + if session is None: + BOLD_file_name = "{subj_id}_{task}_time-series.npy" + file_suffix = f"{task}" + else: + BOLD_file_name = "{subj_id}_{session}_{task}_time-series.npy" + file_suffix = f"{session}_{task}" else: print(f"TASK: {task}, RUN: {run} started ...") - file_suffix = f"{task}_{run}" - BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy" + if session is None: + BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy" + file_suffix = f"{task}_{run}" + else: + BOLD_file_name = "{subj_id}_{session}_{task}_{run}_time-series.npy" + file_suffix = f"{session}_{task}_{run}" ################################# LOAD DATA ################################# BOLD = data_loader.load_TS( data_root=roi_root, file_name=BOLD_file_name, - SESSIONs=task, subj_id2load=None, task=task, + session=session, run=run, ) ################################ Measures of dFC ################################# MA = MultiAnalysis( - analysis_name=f"task-based-dFC-{dataset}-{file_suffix}", **params_multi_analysis + analysis_name=f"task-based-dFC-{file_suffix}", **params_multi_analysis ) MEASURES_lst = MA.measures_initializer( @@ -151,9 +86,99 @@ # Save if not os.path.exists(f"{output_root}"): os.makedirs(f"{output_root}") - np.save(f"{output_root}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure) + np.save(f"{output_dir}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure) print(f"Measurement required {time.time() - tic:0.3f} seconds.") - 
np.save(f"{output_root}/multi-analysis_{file_suffix}.npy", MA) + np.save(f"{output_dir}/multi-analysis_{file_suffix}.npy", MA) + + +######################################################################################## + +if __name__ == "__main__": + # argparse + HELPTEXT = """ + Script to fit dFC methods for a given task. + """ + + parser = argparse.ArgumentParser(description=HELPTEXT) + + parser.add_argument("--dataset_info", type=str, help="path to dataset info file") + parser.add_argument("--methods_config", type=str, help="methods config file") + + args = parser.parse_args() + + dataset_info_file = args.dataset_info + methods_config_file = args.methods_config + + # Read dataset info + with open(dataset_info_file, "r") as f: + dataset_info = json.load(f) + + # Read methods config + with open(methods_config_file, "r") as f: + methods_config = json.load(f) + TASKS = dataset_info["TASKS"] + + job_id = int(os.getenv("SGE_TASK_ID")) + TASK_id = job_id - 1 # SGE_TASK_ID starts from 1 not 0 + if TASK_id >= len(TASKS): + print("TASK_id out of TASKS") + exit() + task = TASKS[TASK_id] + + print(f"FCS estimation CODE started running ... 
for task: {task} ...") + + SESSIONS = dataset_info["SESSIONS"] + if SESSIONS is None: + SESSIONS = [None] + RUNS = dataset_info["RUNS"] + if RUNS is None: + RUNS = {task: [None]} + + if "{dataset}" in dataset_info["main_root"]: + main_root = dataset_info["main_root"].replace( + "{dataset}", dataset_info["dataset"] + ) + else: + main_root = dataset_info["main_root"] + + if "{main_root}" in dataset_info["fmriprep_root"]: + fmriprep_root = dataset_info["fmriprep_root"].replace("{main_root}", main_root) + else: + fmriprep_root = dataset_info["fmriprep_root"] + + if "{main_root}" in dataset_info["roi_root"]: + roi_root = dataset_info["roi_root"].replace("{main_root}", main_root) + else: + roi_root = dataset_info["roi_root"] + + if "{main_root}" in dataset_info["fitted_measures_root"]: + fitted_measures_root = dataset_info["fitted_measures_root"].replace( + "{main_root}", main_root + ) + else: + fitted_measures_root = dataset_info["fitted_measures_root"] + + # methods params + params_methods = methods_config["params_methods"] + MEASURES_name_lst = methods_config["MEASURES_name_lst"] + alter_hparams = methods_config["alter_hparams"] + params_multi_analysis = methods_config["params_multi_analysis"] + + for session in SESSIONS: + for run in RUNS[task]: + run_FCS_estimate( + params_methods=params_methods, + MEASURES_name_lst=MEASURES_name_lst, + alter_hparams=alter_hparams, + params_multi_analysis=params_multi_analysis, + task=task, + roi_root=roi_root, + output_root=fitted_measures_root, + session=session, + run=run, + ) + + print(f"FCS estimation CODE finished running ... 
for task: {task} ...") ################################################################################# diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py index b3068b0..84c77a1 100644 --- a/task_dFC/dFC_assessment.py +++ b/task_dFC/dFC_assessment.py @@ -158,7 +158,7 @@ def run_dFC_assess( dataset_info_file = args.dataset_info participant_id = args.participant_id - # Read global configs + # Read dataset info with open(dataset_info_file, "r") as f: dataset_info = json.load(f) diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index 5c19561..fc54d9c 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -183,7 +183,7 @@ def run_roi_signal_extraction( dataset_info_file = args.dataset_info participant_id = args.participant_id - # Read global configs + # Read dataset info with open(dataset_info_file, "r") as f: dataset_info = json.load(f) From 41e78b67c524a12488b69625b8bc1c611c1ebd63 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 30 May 2024 18:36:37 -0400 Subject: [PATCH 028/401] minor fix --- task_dFC/FCS_estimate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py index 027177a..67f2591 100644 --- a/task_dFC/FCS_estimate.py +++ b/task_dFC/FCS_estimate.py @@ -84,8 +84,8 @@ def run_FCS_estimate( print("FCS estimation done.") # Save - if not os.path.exists(f"{output_root}"): - os.makedirs(f"{output_root}") + if not os.path.exists(f"{output_dir}"): + os.makedirs(f"{output_dir}") np.save(f"{output_dir}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure) print(f"Measurement required {time.time() - tic:0.3f} seconds.") From a9107efbcc8e37d35138d8b054bfc11545c00f85 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 31 May 2024 12:05:23 -0400 Subject: [PATCH 029/401] event_types fix --- pydfc/simul_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py index 
d716498..b4e155f 100644 --- a/pydfc/simul_utils.py +++ b/pydfc/simul_utils.py @@ -184,7 +184,6 @@ def create_simul_task_info( """ ####################### EXTRACT TASK LABELS ####################### events = [] - event_types = ["rest", "task"] # using onset, task_duration, task_block_duration to create the events events.append(["onset", "duration", "trial_type"]) @@ -194,11 +193,10 @@ def create_simul_task_info( t += task_block_duration events = np.array(events) - event_labels, Fs_task = task_utils.events_time_to_labels( + event_labels, Fs_task, event_types = task_utils.events_time_to_labels( events=events, TR_mri=TR_mri, num_time_mri=num_time_mri, - event_types=event_types, oversampling=oversampling, return_0_1=False, ) From c16b59d25dec901eb8ca90979a5c0fe5bf5dd03b Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 31 May 2024 12:21:08 -0400 Subject: [PATCH 030/401] minor fix --- simul_dFC/task_data_simulator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py index 7fe64c3..bf8709e 100644 --- a/simul_dFC/task_data_simulator.py +++ b/simul_dFC/task_data_simulator.py @@ -22,7 +22,7 @@ # data paths dataset = "ds000001" # main_root = f"./DATA/{dataset}" # for local -main_root = f"../../DATA/task-based/simulated/{dataset}" # for server +main_root = f"/data/origami/dFC/DATA/task-based/simulated/{dataset}" # for server output_root = f"{main_root}/derivatives/ROI_timeseries" # simulation parameters From 639a2d5c071ff1e5bf75262b00b5687f80a10ae4 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 31 May 2024 12:24:57 -0400 Subject: [PATCH 031/401] minor fix --- simul_dFC/task_data_simulator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py index bf8709e..24aa92a 100644 --- a/simul_dFC/task_data_simulator.py +++ b/simul_dFC/task_data_simulator.py @@ -20,7 +20,7 @@ ################################# 
Parameters #################################### # data paths -dataset = "ds000001" +dataset = "ds000002" # main_root = f"./DATA/{dataset}" # for local main_root = f"/data/origami/dFC/DATA/task-based/simulated/{dataset}" # for server output_root = f"{main_root}/derivatives/ROI_timeseries" From 6c4c8851914bebe7dcbcb60f104f2ca3f986fbca Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 3 Jun 2024 13:00:55 -0400 Subject: [PATCH 032/401] minor fix --- task_dFC/FCS_estimate.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py index 67f2591..42bb99f 100644 --- a/task_dFC/FCS_estimate.py +++ b/task_dFC/FCS_estimate.py @@ -129,10 +129,17 @@ def run_FCS_estimate( print(f"FCS estimation CODE started running ... for task: {task} ...") - SESSIONS = dataset_info["SESSIONS"] + if "SESSIONS" in dataset_info: + SESSIONS = dataset_info["SESSIONS"] + else: + SESSIONS = None if SESSIONS is None: SESSIONS = [None] - RUNS = dataset_info["RUNS"] + + if "RUNS" in dataset_info: + RUNS = dataset_info["RUNS"] + else: + RUNS = None if RUNS is None: RUNS = {task: [None]} From 5cae1c9344fff477546d3c00691872e6c9f4e3f9 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 3 Jun 2024 16:43:12 -0400 Subject: [PATCH 033/401] minor fix --- task_dFC/FCS_estimate.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py index 42bb99f..064988c 100644 --- a/task_dFC/FCS_estimate.py +++ b/task_dFC/FCS_estimate.py @@ -150,11 +150,6 @@ def run_FCS_estimate( else: main_root = dataset_info["main_root"] - if "{main_root}" in dataset_info["fmriprep_root"]: - fmriprep_root = dataset_info["fmriprep_root"].replace("{main_root}", main_root) - else: - fmriprep_root = dataset_info["fmriprep_root"] - if "{main_root}" in dataset_info["roi_root"]: roi_root = dataset_info["roi_root"].replace("{main_root}", main_root) else: From 725493e11146f68d267eb30558ac31f1c7ee2dcf Mon Sep 17 00:00:00 2001 
From: mtorabi59 Date: Mon, 3 Jun 2024 18:37:01 -0400 Subject: [PATCH 034/401] add session to dFC assess --- task_dFC/dFC_assessment.py | 214 +++++++++++++++++++------------------ 1 file changed, 112 insertions(+), 102 deletions(-) diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py index 84c77a1..3b858a5 100644 --- a/task_dFC/dFC_assessment.py +++ b/task_dFC/dFC_assessment.py @@ -23,121 +23,113 @@ def run_dFC_assess( roi_root, fitted_measures_root, output_root, + session=None, + run=None, ): + if session is None: + output_dir = f"{output_root}/{subj_id}" + fitted_measures_dir = f"{fitted_measures_root}" + else: + output_dir = f"{output_root}/{subj_id}/{session}" + fitted_measures_dir = f"{fitted_measures_root}/{session}" + + if run is None: + if session is None: + print(f"Subject-level dFC assessment started for TASK: {task} ...") + input_root = f"{roi_root}/{subj_id}" + BOLD_file_name = "{subj_id}_{task}_time-series.npy" + file_suffix = f"{task}" + else: + print( + f"Subject-level dFC assessment started for Session {session}, TASK: {task} ..." + ) + input_root = f"{roi_root}/{subj_id}/{session}" + BOLD_file_name = "{subj_id}_{session}_{task}_time-series.npy" + file_suffix = f"{session}_{task}" + else: + if session is None: + print( + f"Subject-level dFC assessment started for TASK: {task}, RUN: {run} ..." + ) + input_root = f"{roi_root}/{subj_id}" + BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy" + file_suffix = f"{task}_{run}" + else: + print( + f"Subject-level dFC assessment started for Session {session}, TASK: {task}, RUN: {run} ..." 
+ ) + input_root = f"{roi_root}/{subj_id}/{session}" + BOLD_file_name = "{subj_id}_{session}_{task}_{run}_time-series.npy" + file_suffix = f"{session}_{task}_{run}" # check if the subject has this task in roi_root - if not os.path.exists(f"{roi_root}/{subj_id}"): - print(f"Subject {subj_id} not found in {roi_root}") + if not os.path.exists(input_root): + print(f"{input_root} not found in {roi_root}") return - ALL_ROI_FILES = os.listdir(f"{roi_root}/{subj_id}/") + ALL_ROI_FILES = os.listdir(f"{input_root}/") ALL_ROI_FILES = [ roi_file for roi_file in ALL_ROI_FILES if ("_time-series.npy" in roi_file) and (task in roi_file) ] + if session is not None: + ALL_ROI_FILES = [roi_file for roi_file in ALL_ROI_FILES if (session in roi_file)] + if run is not None: + ALL_ROI_FILES = [roi_file for roi_file in ALL_ROI_FILES if (run in roi_file)] ALL_ROI_FILES.sort() # if there are no files for this task, return if not len(ALL_ROI_FILES) >= 1: - print(f"No time series files found for {subj_id} {task}") + print(f"No time series files found for {subj_id} {file_suffix}") return + ################################# LOAD FIT MEASURES ################################# + + MA = np.load( + f"{fitted_measures_dir}/multi-analysis_{file_suffix}.npy", + allow_pickle="TRUE", + ).item() + + ALL_RECORDS = os.listdir(f"{fitted_measures_dir}/") + ALL_RECORDS = [i for i in ALL_RECORDS if ("MEASURE" in i) and (file_suffix in i)] + ALL_RECORDS.sort() + MEASURES_fit_lst = list() + for s in ALL_RECORDS: + fit_measure = np.load(f"{fitted_measures_dir}/{s}", allow_pickle="TRUE").item() + MEASURES_fit_lst.append(fit_measure) + MA.set_MEASURES_fit_lst(MEASURES_fit_lst) + print("fitted MEASURES are loaded ...") + + ################################# LOAD DATA ################################# + + BOLD = data_loader.load_TS( + data_root=roi_root, + file_name=BOLD_file_name, + subj_id2load=subj_id, + task=task, + session=session, + run=run, + ) - # check if "_run" exists in all the task file names - if 
all(["_run" in roi_file for roi_file in ALL_ROI_FILES]): - # find all the runs - RUNS = [ - roi_file[ - roi_file.find("_run") - + 1 : roi_file.find("_run") - + 1 - + roi_file[roi_file.find("_run") + 1 :].find("_") - ] - for roi_file in ALL_ROI_FILES - ] - # sort - RUNS.sort() - print(f"Found multiple runs for {subj_id} {task}: {RUNS}") - else: - RUNS = [None] - - for run in RUNS: - - # check if the subject has this task and run in roi_root - if run is None: - file_suffix = f"{task}" - if not os.path.exists( - f"{roi_root}/{subj_id}/{subj_id}_{file_suffix}_time-series.npy" - ): - print(f"Time series file not found for {subj_id} {task}") - continue - else: - print( - f"subject-level dFC assessment CODE started running ... for task {task} of subject {subj_id} ..." - ) - BOLD_file_name = "{subj_id}_{task}_time-series.npy" - else: - file_suffix = f"{task}_{run}" - if not os.path.exists( - f"{roi_root}/{subj_id}/{subj_id}_{file_suffix}_time-series.npy" - ): - print(f"Time series file not found for {subj_id} {task} {run}") - continue - else: - print( - f"subject-level dFC assessment CODE started running ... for task {task} and {run} of subject {subj_id} ..." 
- ) - BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy" - - ################################# LOAD FIT MEASURES ################################# - - MA = np.load( - f"{fitted_measures_root}/multi-analysis_{file_suffix}.npy", - allow_pickle="TRUE", - ).item() - - ALL_RECORDS = os.listdir(f"{fitted_measures_root}/") - ALL_RECORDS = [i for i in ALL_RECORDS if ("MEASURE" in i) and (file_suffix in i)] - ALL_RECORDS.sort() - MEASURES_fit_lst = list() - for s in ALL_RECORDS: - fit_measure = np.load( - f"{fitted_measures_root}/{s}", allow_pickle="TRUE" - ).item() - MEASURES_fit_lst.append(fit_measure) - MA.set_MEASURES_fit_lst(MEASURES_fit_lst) - print("fitted MEASURES are loaded ...") - - ################################# LOAD DATA ################################# - - BOLD = data_loader.load_TS( - data_root=roi_root, - file_name=BOLD_file_name, - SESSIONs=task, - subj_id2load=subj_id, - task=task, - run=run, - ) - - ################################# dFC ASSESSMENT ################################# + ################################# dFC ASSESSMENT ################################# - tic = time.time() - print("Measurement Started ...") + tic = time.time() + print("Measurement Started ...") - print("dFC estimation started...") - dFC_dict = MA.subj_lvl_dFC_assess(time_series=BOLD) - print("dFC estimation done.") + print("dFC estimation started...") + dFC_dict = MA.subj_lvl_dFC_assess(time_series=BOLD) + print("dFC estimation done.") - print(f"Measurement required {time.time() - tic:0.3f} seconds.") + print(f"Measurement required {time.time() - tic:0.3f} seconds.") - ################################# SAVE DATA ################################# + ################################# SAVE DATA ################################# - folder = f"{output_root}/{subj_id}" - if not os.path.exists(folder): - os.makedirs(folder) + folder = f"{output_dir}/" + if not os.path.exists(folder): + os.makedirs(folder) - for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]): - 
np.save(f"{folder}/dFC_{file_suffix}_{dFC_id}.npy", dFC) + for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]): + np.save(f"{folder}dFC_{file_suffix}_{dFC_id}.npy", dFC) ####################################################################################### @@ -192,17 +184,35 @@ def run_dFC_assess( else: output_root = dataset_info["dFC_root"] - for task in TASKS: - run_dFC_assess( - subj_id=participant_id, - task=task, - roi_root=roi_root, - fitted_measures_root=fitted_measures_root, - output_root=output_root, - ) + if "SESSIONS" in dataset_info: + SESSIONS = dataset_info["SESSIONS"] + else: + SESSIONS = None + if SESSIONS is None: + SESSIONS = [None] + + if "RUNS" in dataset_info: + RUNS = dataset_info["RUNS"] + else: + RUNS = None + if RUNS is None: + RUNS = {task: [None] for task in TASKS} + + for session in SESSIONS: + for task in TASKS: + for run in RUNS[task]: + run_dFC_assess( + subj_id=participant_id, + task=task, + roi_root=roi_root, + fitted_measures_root=fitted_measures_root, + output_root=output_root, + session=session, + run=run, + ) print( - f"subject-level dFC assessment CODE finished running ... for subject: {participant_id} ..." 
+ f"subject-level dFC assessment CODE finished running for subject: {participant_id}" ) ####################################################################################### From 9c412ec74299f7db350ad770e09e16335eed2be7 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 7 Jun 2024 13:36:30 -0400 Subject: [PATCH 035/401] add session to ML --- task_dFC/ML.py | 383 ++++++++++++++++++++++++++++++------------------- 1 file changed, 238 insertions(+), 145 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index d70b4f9..15c575d 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -16,7 +16,7 @@ ####################################################################################### -def find_available_subjects(dFC_root, task, run=None, dFC_id=None): +def find_available_subjects(dFC_root, task, run=None, session=None, dFC_id=None): """ Find the subjects that have dFC results for the given task and dFC_id (method). """ @@ -25,7 +25,10 @@ def find_available_subjects(dFC_root, task, run=None, dFC_id=None): ALL_SUBJ_FOLDERS = [folder for folder in ALL_SUBJ_FOLDERS if "sub-" in folder] ALL_SUBJ_FOLDERS.sort() for subj_folder in ALL_SUBJ_FOLDERS: - ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/") + if session is None: + ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/") + else: + ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/{session}/") ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if task in dFC_file] if dFC_id is not None: ALL_DFC_FILES = [ @@ -33,88 +36,93 @@ def find_available_subjects(dFC_root, task, run=None, dFC_id=None): ] if run is not None: ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if run in dFC_file] + if session is not None: + ALL_DFC_FILES = [ + dFC_file for dFC_file in ALL_DFC_FILES if session in dFC_file + ] ALL_DFC_FILES.sort() if len(ALL_DFC_FILES) > 0: SUBJECTS.append(subj_folder) return SUBJECTS -def extract_task_features(TASKS, RUNS, roi_root, output_root): +def extract_task_features(TASKS, RUNS, 
SESSIONS, roi_root, output_root): """ Extract task features from the event data.""" - task_features = { - "task": list(), - "run": list(), - "relative_task_on": list(), - "avg_task_duration": list(), - "var_task_duration": list(), - "avg_rest_duration": list(), - "var_rest_duration": list(), - "num_of_transitions": list(), - "relative_transition_freq": list(), - } - for task_id, task in enumerate(TASKS): - - if task == "task-restingstate": - continue - - if RUNS is None: - RUNS = {task: [None]} - for run in RUNS[task]: - - SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task, run=run) - - for subj in SUBJECTS: - # event data - if run is None: - task_data = np.load( - f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", - allow_pickle="TRUE", - ).item() - else: - task_data = np.load( - f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy", - allow_pickle="TRUE", - ).item() - Fs_task = task_data["Fs_task"] - TR_task = 1 / Fs_task - - task_presence = task_utils.extract_task_presence( - event_labels=task_data["event_labels"], - TR_task=TR_task, - TR_mri=task_data["TR_mri"], - binary=True, - binarizing_method="median", - ) + for session in SESSIONS: + task_features = { + "task": list(), + "run": list(), + "relative_task_on": list(), + "avg_task_duration": list(), + "var_task_duration": list(), + "avg_rest_duration": list(), + "var_rest_duration": list(), + "num_of_transitions": list(), + "relative_transition_freq": list(), + } + for task_id, task in enumerate(TASKS): - relative_task_on = task_utils.relative_task_on(task_presence) - # task duration - avg_task_duration, var_task_duration = task_utils.task_duration( - task_presence, task_data["TR_mri"] - ) - # rest duration - avg_rest_duration, var_rest_duration = task_utils.rest_duration( - task_presence, task_data["TR_mri"] - ) - # freq of transitions - num_of_transitions, relative_transition_freq = task_utils.transition_freq( - task_presence - ) + if task == "task-restingstate": + continue - 
task_features["task"].append(task) - task_features["run"].append(run) - task_features["relative_task_on"].append(relative_task_on) - task_features["avg_task_duration"].append(avg_task_duration) - task_features["var_task_duration"].append(var_task_duration) - task_features["avg_rest_duration"].append(avg_rest_duration) - task_features["var_rest_duration"].append(var_rest_duration) - task_features["num_of_transitions"].append(num_of_transitions) - task_features["relative_transition_freq"].append(relative_transition_freq) + if RUNS is None: + RUNS = {task: [None]} + for run in RUNS[task]: - folder = f"{output_root}" - if not os.path.exists(folder): - os.makedirs(folder) - np.save(f"{folder}/task_features.npy", task_features) + SUBJECTS = find_available_subjects( + dFC_root=dFC_root, task=task, run=run, session=session + ) + + for subj in SUBJECTS: + # event data + task_data = load_task_data( + roi_root=roi_root, subj=subj, task=task, run=run, session=session + ) + Fs_task = task_data["Fs_task"] + TR_task = 1 / Fs_task + + task_presence = task_utils.extract_task_presence( + event_labels=task_data["event_labels"], + TR_task=TR_task, + TR_mri=task_data["TR_mri"], + binary=True, + binarizing_method="mean", + ) + + relative_task_on = task_utils.relative_task_on(task_presence) + # task duration + avg_task_duration, var_task_duration = task_utils.task_duration( + task_presence, task_data["TR_mri"] + ) + # rest duration + avg_rest_duration, var_rest_duration = task_utils.rest_duration( + task_presence, task_data["TR_mri"] + ) + # freq of transitions + num_of_transitions, relative_transition_freq = ( + task_utils.transition_freq(task_presence) + ) + + task_features["task"].append(task) + task_features["run"].append(run) + task_features["relative_task_on"].append(relative_task_on) + task_features["avg_task_duration"].append(avg_task_duration) + task_features["var_task_duration"].append(var_task_duration) + task_features["avg_rest_duration"].append(avg_rest_duration) + 
task_features["var_rest_duration"].append(var_rest_duration) + task_features["num_of_transitions"].append(num_of_transitions) + task_features["relative_transition_freq"].append( + relative_transition_freq + ) + + if session is None: + folder = f"{output_root}" + else: + folder = f"{output_root}/{session}" + if not os.path.exists(folder): + os.makedirs(folder) + np.save(f"{folder}/task_features.npy", task_features) def dFC_feature_extraction_subj_lvl( @@ -142,7 +150,7 @@ def dFC_feature_extraction_subj_lvl( TR_mri=task_data["TR_mri"], TR_array=TR_array, binary=True, - binarizing_method="median", + binarizing_method="mean", ) features = dFC_vecs @@ -175,6 +183,63 @@ def dFC_feature_extraction_subj_lvl( return features, target +def load_dFC(dFC_root, subj, task, dFC_id, run=None, session=None): + """ + Load the dFC results for a given subject, task, dFC_id, run and session. + """ + if session is None: + if run is None: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + else: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + else: + if run is None: + dFC = np.load( + f"{dFC_root}/{subj}/{session}/dFC_{session}_{task}_{dFC_id}.npy", + allow_pickle="TRUE", + ).item() + else: + dFC = np.load( + f"{dFC_root}/{subj}/{session}/dFC_{session}_{task}_{run}_{dFC_id}.npy", + allow_pickle="TRUE", + ).item() + + return dFC + + +def load_task_data(roi_root, subj, task, run=None, session=None): + """ + Load the task data for a given subject, task and run. 
+ """ + if session is None: + if run is None: + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" + ).item() + else: + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy", + allow_pickle="TRUE", + ).item() + else: + if run is None: + task_data = np.load( + f"{roi_root}/{subj}/{session}/{subj}_{session}_{task}_task-data.npy", + allow_pickle="TRUE", + ).item() + else: + task_data = np.load( + f"{roi_root}/{subj}/{session}/{subj}_{session}_{task}_{run}_task-data.npy", + allow_pickle="TRUE", + ).item() + + return task_data + + def dFC_feature_extraction( task, train_subjects, @@ -183,6 +248,7 @@ def dFC_feature_extraction( roi_root, dFC_root, run=None, + session=None, dynamic_pred="no", normalize_dFC=True, ): @@ -191,25 +257,23 @@ def dFC_feature_extraction( for all subjects. if run is specified, dFC results for that run will be used. """ + dFC_measure_name = None X_train = None y_train = None subj_label_train = list() for subj in train_subjects: - if run is None: - dFC = np.load( - f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" - ).item() - task_data = np.load( - f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" - ).item() - else: - dFC = np.load( - f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE" - ).item() - task_data = np.load( - f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy", - allow_pickle="TRUE", - ).item() + + dFC = load_dFC( + dFC_root=dFC_root, + subj=subj, + task=task, + dFC_id=dFC_id, + run=run, + session=session, + ) + task_data = load_task_data( + roi_root=roi_root, subj=subj, task=task, run=run, session=session + ) X_subj, y_subj = dFC_feature_extraction_subj_lvl( dFC=dFC, @@ -226,25 +290,28 @@ def dFC_feature_extraction( X_train = np.concatenate((X_train, X_subj), axis=0) y_train = np.concatenate((y_train, y_subj), axis=0) + if dFC_measure_name is None: + dFC_measure_name = dFC.measure.measure_name + else: + assert ( 
+ dFC_measure_name == dFC.measure.measure_name + ), "dFC measure is not consistent." + X_test = None y_test = None subj_label_test = list() for subj in test_subjects: - if run is None: - dFC = np.load( - f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" - ).item() - task_data = np.load( - f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" - ).item() - else: - dFC = np.load( - f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE" - ).item() - task_data = np.load( - f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy", - allow_pickle="TRUE", - ).item() + dFC = load_dFC( + dFC_root=dFC_root, + subj=subj, + task=task, + dFC_id=dFC_id, + run=run, + session=session, + ) + task_data = load_task_data( + roi_root=roi_root, subj=subj, task=task, run=run, session=session + ) X_subj, y_subj = dFC_feature_extraction_subj_lvl( dFC=dFC, @@ -261,6 +328,13 @@ def dFC_feature_extraction( X_test = np.concatenate((X_test, X_subj), axis=0) y_test = np.concatenate((y_test, y_subj), axis=0) + if dFC_measure_name is None: + dFC_measure_name = dFC.measure.measure_name + else: + assert ( + dFC_measure_name == dFC.measure.measure_name + ), "dFC measure is not consistent." 
+ print(X_train.shape, X_test.shape, y_train.shape, y_test.shape) subj_label_train = np.array(subj_label_train) subj_label_test = np.array(subj_label_test) @@ -272,7 +346,7 @@ def dFC_feature_extraction( y_test, subj_label_train, subj_label_test, - dFC.measure.measure_name, + dFC_measure_name, ) @@ -282,6 +356,7 @@ def task_presence_classification( roi_root, dFC_root, run=None, + session=None, dynamic_pred="no", normalize_dFC=True, train_test_ratio=0.8, @@ -299,7 +374,7 @@ def task_presence_classification( return SUBJECTS = find_available_subjects( - dFC_root=dFC_root, task=task, run=run, dFC_id=dFC_id + dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id ) # randomly select train_test_ratio of the subjects for training @@ -321,6 +396,7 @@ def task_presence_classification( roi_root=roi_root, dFC_root=dFC_root, run=run, + session=session, dynamic_pred=dynamic_pred, normalize_dFC=normalize_dFC, ) @@ -406,51 +482,59 @@ def task_presence_classification( def run_classification( TASKS, RUNS, + SESSIONS, roi_root, dFC_root, output_root, dynamic_pred="no", normalize_dFC=True, ): - ML_scores = { - "subj_id": list(), - "group": list(), - "task": list(), - "run": list(), - "dFC method": list(), - "KNN accuracy": list(), - } - for dFC_id in range(0, 7): - print(f"=================== dFC {dFC_id} ===================") - - ML_RESULT = {} - for task_id, task in enumerate(TASKS): - if RUNS is None: - RUNS = {task: [None]} - ML_RESULT[task] = {} - for run in RUNS[task]: - ML_RESULT_new, ML_scores_new = task_presence_classification( - task=task, - dFC_id=dFC_id, - roi_root=roi_root, - dFC_root=dFC_root, - run=run, - dynamic_pred=dynamic_pred, - normalize_dFC=normalize_dFC, - ) - if run is None: - ML_RESULT[task] = ML_RESULT_new - else: - ML_RESULT[task][run] = ML_RESULT_new - for key in ML_scores: - ML_scores[key].extend(ML_scores_new[key]) - - folder = f"{output_root}" - if not os.path.exists(folder): - os.makedirs(folder) - 
np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT) - - np.save(f"{folder}/ML_scores_classify.npy", ML_scores) + for session in SESSIONS: + if not session is None: + print(f"=================== {session} ===================") + ML_scores = { + "subj_id": list(), + "group": list(), + "task": list(), + "run": list(), + "dFC method": list(), + "KNN accuracy": list(), + } + for dFC_id in range(0, 7): + print(f"=================== dFC {dFC_id} ===================") + + ML_RESULT = {} + for task_id, task in enumerate(TASKS): + if RUNS is None: + RUNS = {task: [None]} + ML_RESULT[task] = {} + for run in RUNS[task]: + ML_RESULT_new, ML_scores_new = task_presence_classification( + task=task, + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + run=run, + session=session, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, + ) + if run is None: + ML_RESULT[task] = ML_RESULT_new + else: + ML_RESULT[task][run] = ML_RESULT_new + for key in ML_scores: + ML_scores[key].extend(ML_scores_new[key]) + + if session is None: + folder = f"{output_root}" + else: + folder = f"{output_root}/{session}" + if not os.path.exists(folder): + os.makedirs(folder) + np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT) + + np.save(f"{folder}/ML_scores_classify.npy", ML_scores) ####################################################################################### @@ -484,6 +568,13 @@ def run_classification( else: RUNS = None + if "SESSIONS" in dataset_info: + SESSIONS = dataset_info["SESSIONS"] + else: + SESSIONS = None + if SESSIONS is None: + SESSIONS = [None] + if "{dataset}" in dataset_info["main_root"]: main_root = dataset_info["main_root"].replace( "{dataset}", dataset_info["dataset"] @@ -509,12 +600,14 @@ def run_classification( extract_task_features( TASKS=TASKS, RUNS=RUNS, + SESSIONS=SESSIONS, roi_root=roi_root, output_root=ML_root, ) run_classification( TASKS=TASKS, RUNS=RUNS, + SESSIONS=SESSIONS, roi_root=roi_root, dFC_root=dFC_root, output_root=ML_root, From 
56b4c052c40d36b35f394158779e0d8c2379a4d9 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Sun, 9 Jun 2024 21:48:24 -0400 Subject: [PATCH 036/401] minor fix --- task_dFC/ML.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 15c575d..e24b61a 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -66,8 +66,6 @@ def extract_task_features(TASKS, RUNS, SESSIONS, roi_root, output_root): if task == "task-restingstate": continue - if RUNS is None: - RUNS = {task: [None]} for run in RUNS[task]: SUBJECTS = find_available_subjects( @@ -505,8 +503,6 @@ def run_classification( ML_RESULT = {} for task_id, task in enumerate(TASKS): - if RUNS is None: - RUNS = {task: [None]} ML_RESULT[task] = {} for run in RUNS[task]: ML_RESULT_new, ML_scores_new = task_presence_classification( @@ -561,12 +557,11 @@ def run_classification( TASKS = dataset_info["TASKS"] if "RUNS" in dataset_info: - if dataset_info["RUNS"] is not None: - RUNS = dataset_info["RUNS"] - else: - RUNS = None + RUNS = dataset_info["RUNS"] else: RUNS = None + if RUNS is None: + RUNS = {task: [None] for task in TASKS} if "SESSIONS" in dataset_info: SESSIONS = dataset_info["SESSIONS"] From 6ccd229074b03cbcd1abc0ef96320911b24f4e4b Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 11 Jun 2024 21:52:36 -0400 Subject: [PATCH 037/401] add logistic reg --- task_dFC/ML.py | 41 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index e24b61a..bf50e27 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -4,6 +4,7 @@ import numpy as np from sklearn.decomposition import PCA +from sklearn.linear_model import LogisticRegression from sklearn.metrics import balanced_accuracy_score from sklearn.model_selection import GridSearchCV from sklearn.neighbors import KNeighborsClassifier @@ -361,7 +362,8 @@ def task_presence_classification( explained_var_threshold=0.95, ): """ - perform task presence 
classification using KNN for a given task and dFC method and run. + perform task presence classification using logistic regression and KNN + for a given task and dFC method and run. """ if run is None: print(f"=============== {task} ===============") @@ -404,6 +406,23 @@ def task_presence_classification( print("task presence classification ...") + # logistic regression + logistic_reg = make_pipeline(StandardScaler(), LogisticRegression()) + # create a dictionary of all values we want to test for C + param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]} + # use gridsearch to test all values for C + lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=5) + # fit model to data + lr_gscv.fit(X_train, y_train) + + C = lr_gscv.best_params_["C"] + + log_reg = make_pipeline( + StandardScaler(), + LogisticRegression(C=C), + ).fit(X_train, y_train) + + # KNN # find num_PCs pca = PCA(svd_solver="full", whiten=False) pca.fit(X_train) @@ -434,6 +453,10 @@ def task_presence_classification( ).fit(X_train, y_train) ML_RESULT = { + "logistic regression": log_reg, + "logistic regression C": C, + "logistic regression train score": log_reg.score(X_train, y_train), + "logistic regression test score": log_reg.score(X_test, y_test), "pca": pca, "num_PCs": num_PCs, "cv_results": knn_gscv.cv_results_, @@ -442,6 +465,12 @@ def task_presence_classification( "KNN test score": neigh.score(X_test, y_test), } + print( + f"Logistic regression train score {measure_name} {task}: {log_reg.score(X_train, y_train)}" + ) + print( + f"Logistic regression test score {measure_name} {task}: {log_reg.score(X_test, y_test)}" + ) print(f"KNN train score {measure_name} {task}: {neigh.score(X_train, y_train)}") print(f"KNN test score {measure_name} {task}: {neigh.score(X_test, y_test)}") @@ -453,6 +482,7 @@ def task_presence_classification( "task": list(), "run": list(), "dFC method": list(), + "Logistic regression accuracy": list(), "KNN accuracy": list(), } for subj in SUBJECTS: @@ -466,9 +496,13 @@ def 
task_presence_classification( features = X_test[subj_label_test == subj, :] target = y_test[subj_label_test == subj] - pred = neigh.predict(features) + pred_lr = log_reg.predict(features) + pred_KNN = neigh.predict(features) - ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred)) + ML_scores["Logistic regression accuracy"].append( + balanced_accuracy_score(target, pred_lr) + ) + ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN)) ML_scores["task"].append(task) ML_scores["run"].append(run) @@ -496,6 +530,7 @@ def run_classification( "task": list(), "run": list(), "dFC method": list(), + "Logistic regression accuracy": list(), "KNN accuracy": list(), } for dFC_id in range(0, 7): From 7fa88fb2d3d0ca69a2f54cec94060394790476d3 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 11 Jun 2024 22:34:02 -0400 Subject: [PATCH 038/401] fix file load manage --- task_dFC/ML.py | 10 ++-- task_dFC/dFC_assessment.py | 14 ++++-- task_dFC/nifti_to_roi_signal.py | 87 ++++++++++++++++++--------------- 3 files changed, 65 insertions(+), 46 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index bf50e27..8171048 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -30,16 +30,20 @@ def find_available_subjects(dFC_root, task, run=None, session=None, dFC_id=None) ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/") else: ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/{session}/") - ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if task in dFC_file] + ALL_DFC_FILES = [ + dFC_file for dFC_file in ALL_DFC_FILES if f"_{task}_" in dFC_file + ] if dFC_id is not None: ALL_DFC_FILES = [ dFC_file for dFC_file in ALL_DFC_FILES if f"_{dFC_id}.npy" in dFC_file ] if run is not None: - ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if run in dFC_file] + ALL_DFC_FILES = [ + dFC_file for dFC_file in ALL_DFC_FILES if f"_{run}_" in dFC_file + ] if session is not None: ALL_DFC_FILES = [ - dFC_file for dFC_file in ALL_DFC_FILES if 
session in dFC_file + dFC_file for dFC_file in ALL_DFC_FILES if f"_{session}_" in dFC_file ] ALL_DFC_FILES.sort() if len(ALL_DFC_FILES) > 0: diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py index 3b858a5..06253ac 100644 --- a/task_dFC/dFC_assessment.py +++ b/task_dFC/dFC_assessment.py @@ -71,12 +71,16 @@ def run_dFC_assess( ALL_ROI_FILES = [ roi_file for roi_file in ALL_ROI_FILES - if ("_time-series.npy" in roi_file) and (task in roi_file) + if ("_time-series.npy" in roi_file) and (f"_{task}_" in roi_file) ] if session is not None: - ALL_ROI_FILES = [roi_file for roi_file in ALL_ROI_FILES if (session in roi_file)] + ALL_ROI_FILES = [ + roi_file for roi_file in ALL_ROI_FILES if (f"_{session}_" in roi_file) + ] if run is not None: - ALL_ROI_FILES = [roi_file for roi_file in ALL_ROI_FILES if (run in roi_file)] + ALL_ROI_FILES = [ + roi_file for roi_file in ALL_ROI_FILES if (f"_{run}_" in roi_file) + ] ALL_ROI_FILES.sort() # if there are no files for this task, return @@ -91,7 +95,9 @@ def run_dFC_assess( ).item() ALL_RECORDS = os.listdir(f"{fitted_measures_dir}/") - ALL_RECORDS = [i for i in ALL_RECORDS if ("MEASURE" in i) and (file_suffix in i)] + ALL_RECORDS = [ + i for i in ALL_RECORDS if ("MEASURE" in i) and (f"_{file_suffix}_" in i) + ] ALL_RECORDS.sort() MEASURES_fit_lst = list() for s in ALL_RECORDS: diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index fc54d9c..0d65049 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -2,6 +2,7 @@ import json import os import warnings +from re import A import numpy as np @@ -18,57 +19,44 @@ def run_roi_signal_extraction( fmriprep_root, bold_suffix, output_root, - session="", + session=None, + RUNS=[None], ): """ Extract ROI signals and task labels for a given subject and task and optionally session. 
""" + if session is None: + session_str = "" + else: + session_str = session # find the func file for this subject and task try: - if session == "": + if session is None: ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/") else: ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/{session}/func/") except FileNotFoundError: - print(f"Subject {subj} {session} not found in {fmriprep_root}") + print(f"Subject {subj} {session_str} not found in {fmriprep_root}") return ALL_TASK_FILES = [ file_i for file_i in ALL_TASK_FILES - if (bold_suffix in file_i) and (task in file_i) + if (bold_suffix in file_i) and (f"_{task}_" in file_i) ] # only keep the denoised files? or use the original files? if not len(ALL_TASK_FILES) >= 1: # if the func file is not found, exclude the subject - print(f"Func file not found for {subj} {session} {task}") + print(f"Func file not found for {subj} {session_str} {task}") return - # there might be multiple runs for the same task - # check if "_run" exists in all the task file names - if all(["_run" in task_file for task_file in ALL_TASK_FILES]): - multi_run_flag = True - # find all the runs - RUNS = [ - task_file[ - task_file.find("_run") - + 1 : task_file.find("_run") - + 1 - + task_file[task_file.find("_run") + 1 :].find("_") - ] - for task_file in ALL_TASK_FILES - ] - # sort - RUNS.sort() - print(f"Found multiple runs for {subj} {task}: {RUNS}") - else: - multi_run_flag = False - RUNS = [""] - for run in RUNS: - task_file = [file_i for file_i in ALL_TASK_FILES if run in file_i][0] - if session == "": + if run is None: + task_file = ALL_TASK_FILES[0] + else: + task_file = [file_i for file_i in ALL_TASK_FILES if f"_{run}_" in file_i][0] + if session is None: nifti_file = f"{fmriprep_root}/{subj}/func/{task_file}" task_events_root = f"{main_root}/bids/{subj}/func" else: @@ -107,15 +95,24 @@ def run_roi_signal_extraction( ALL_EVENTS_FILES = [ file_i for file_i in ALL_EVENTS_FILES - if (subj in file_i) - and (task in file_i) - and (run in 
file_i) - and (session in file_i) + if (f"{subj}_" in file_i) + and (f"_{task}_" in file_i) and ("events.tsv" in file_i) ] + if not run is None: + ALL_EVENTS_FILES = [ + file_i for file_i in ALL_EVENTS_FILES if f"_{run}_" in file_i + ] + if not session is None: + ALL_EVENTS_FILES = [ + file_i for file_i in ALL_EVENTS_FILES if f"_{session}_" in file_i + ] if not len(ALL_EVENTS_FILES) == 1: # if the events file is not found, exclude the subject - print(f"Events file not found for {subj} {session} {task} {run}") + if run is None: + print(f"Events file not found for {subj} {session_str} {task}") + else: + print(f"Events file not found for {subj} {session_str} {task} {run}") return # load the tsv events file events_file = f"{task_events_root}/{ALL_EVENTS_FILES[0]}" @@ -147,17 +144,17 @@ def run_roi_signal_extraction( "num_time_mri": num_time_mri, } - if session == "": + if session is None: subj_session_prefix = f"{subj}" output_dir = f"{output_root}/{subj}" else: subj_session_prefix = f"{subj}_{session}" output_dir = f"{output_root}/{subj}/{session}" - if multi_run_flag: - output_file_prefix = f"{subj_session_prefix}_{task}_{run}" - else: + if run is None: output_file_prefix = f"{subj_session_prefix}_{task}" + else: + output_file_prefix = f"{subj_session_prefix}_{task}_{run}" if not os.path.exists(f"{output_dir}/"): os.makedirs(f"{output_dir}/") @@ -192,9 +189,20 @@ def run_roi_signal_extraction( ) TASKS = dataset_info["TASKS"] - SESSIONS = dataset_info["SESSIONS"] + + if "SESSIONS" in dataset_info: + SESSIONS = dataset_info["SESSIONS"] + else: + SESSIONS = None if SESSIONS is None: - SESSIONS = [""] + SESSIONS = [None] + + if "RUNS" in dataset_info: + RUNS = dataset_info["RUNS"] + else: + RUNS = None + if RUNS is None: + RUNS = {task: [None] for task in TASKS} if "{dataset}" in dataset_info["main_root"]: main_root = dataset_info["main_root"].replace( @@ -223,6 +231,7 @@ def run_roi_signal_extraction( bold_suffix=dataset_info["bold_suffix"], output_root=output_root, 
session=session, + RUNS=RUNS[task], ) print( From 37ef96265e07ba95a861ac6b8f9b39febbac0194 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 12 Jun 2024 15:47:51 -0400 Subject: [PATCH 039/401] add clustering to ML --- task_dFC/ML.py | 172 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 170 insertions(+), 2 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 8171048..88aa2a0 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -3,16 +3,17 @@ import os import numpy as np +from sklearn.cluster import KMeans from sklearn.decomposition import PCA from sklearn.linear_model import LogisticRegression -from sklearn.metrics import balanced_accuracy_score +from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score from sklearn.model_selection import GridSearchCV from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler from pydfc import DFC, data_loader, task_utils -from pydfc.dfc_utils import dFC_mat2vec, rank_norm +from pydfc.dfc_utils import dFC_mat2vec, dFC_vec2mat, rank_norm ####################################################################################### @@ -515,6 +516,103 @@ def task_presence_classification( return ML_RESULT, ML_scores +def task_presence_clustering( + task, + dFC_id, + roi_root, + dFC_root, + run=None, + session=None, + normalize_dFC=True, + explained_var_threshold=0.95, +): + if run is None: + print(f"=============== {task} ===============") + else: + print(f"=============== {task} {run} ===============") + + if task == "task-restingstate": + return + + SUBJECTS = find_available_subjects( + dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id + ) + + print(f"Number of subjects: {len(SUBJECTS)}") + + X, _, y, _, subj_label, _, measure_name = dFC_feature_extraction( + task=task, + train_subjects=SUBJECTS, + test_subjects=[], + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + run=run, + 
session=session, + dynamic_pred="no", + normalize_dFC=normalize_dFC, + ) + + # clustering + # apply kmeans clustering with PCA to dFC features + + n_clusters = 2 # corresponding to task and rest + + scaler = StandardScaler() + X_normalized = scaler.fit_transform(X) + # PCA + # find number of components that explain 95% of variance + pca = PCA() + pca.fit(X_normalized) + n_components = np.where( + np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold + )[0][0] + pca = PCA(n_components=n_components) + X_pca = pca.fit_transform(X_normalized) + kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=4) + labels_pred = kmeans.fit_predict(X_pca) + + # ARI score + print(f"ARI score: {adjusted_rand_score(y, labels_pred)}") + + # visualize clustering centroids + centroids = kmeans.cluster_centers_ + centroids = pca.inverse_transform(centroids) + centroids = scaler.inverse_transform(centroids) + n_regions = (1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2 + centroids_mat = dFC_vec2mat(centroids, n_regions) + + clustering_RESULTS = { + "num_PCs": n_components, + "PCA": pca, + "kmeans": kmeans, + "ARI": adjusted_rand_score(y, labels_pred), + "centroids": centroids_mat, + } + + clustering_scores = { + "subj_id": list(), + "task": list(), + "run": list(), + "dFC method": list(), + "Kmeans ARI": list(), + } + for subj in SUBJECTS: + clustering_scores["subj_id"].append(subj) + features = X[subj_label == subj, :] + target = y[subj_label == subj] + + pred_KNN = kmeans.predict(features) + + clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_KNN)) + + clustering_scores["task"].append(task) + clustering_scores["run"].append(run) + clustering_scores["dFC method"].append(measure_name) + + return clustering_RESULTS, clustering_scores + + def run_classification( TASKS, RUNS, @@ -572,6 +670,61 @@ def run_classification( np.save(f"{folder}/ML_scores_classify.npy", ML_scores) +def run_clustering( + TASKS, + RUNS, + SESSIONS, + roi_root, + dFC_root, + 
output_root, + normalize_dFC=True, +): + for session in SESSIONS: + if not session is None: + print(f"=================== {session} ===================") + clustering_scores = { + "subj_id": list(), + "task": list(), + "run": list(), + "dFC method": list(), + "Kmeans ARI": list(), + } + for dFC_id in range(0, 7): + print(f"=================== dFC {dFC_id} ===================") + + clustering_RESULTS = {} + for task_id, task in enumerate(TASKS): + clustering_RESULTS[task] = {} + for run in RUNS[task]: + clustering_RESULTS_new, clustering_scores_new = ( + task_presence_clustering( + task=task, + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + run=run, + session=session, + normalize_dFC=normalize_dFC, + ) + ) + if run is None: + clustering_RESULTS[task] = clustering_RESULTS_new + else: + clustering_RESULTS[task][run] = clustering_RESULTS_new + for key in clustering_scores: + clustering_scores[key].extend(clustering_scores_new[key]) + + if session is None: + folder = f"{output_root}" + else: + folder = f"{output_root}/{session}" + if not os.path.exists(folder): + os.makedirs(folder) + np.save(f"{folder}/clustering_RESULTS_{dFC_id}.npy", clustering_RESULTS) + + np.save(f"{folder}/clustering_scores.npy", clustering_scores) + + ####################################################################################### if __name__ == "__main__": @@ -638,6 +791,9 @@ def run_classification( roi_root=roi_root, output_root=ML_root, ) + + print("Task features extraction finished.") + print("Task presence classification started ...") run_classification( TASKS=TASKS, RUNS=RUNS, @@ -648,6 +804,18 @@ def run_classification( dynamic_pred="no", normalize_dFC=True, ) + print("Task presence classification finished.") + print("Task presence clustering started ...") + run_clustering( + TASKS=TASKS, + RUNS=RUNS, + SESSIONS=SESSIONS, + roi_root=roi_root, + dFC_root=dFC_root, + output_root=ML_root, + normalize_dFC=True, + ) + print("Task presence clustering finished.") print("Task 
presence prediction CODE finished running.") From 0a0842363dd277a2aea124e83ab9975935dd6c3d Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 13 Jun 2024 13:53:01 -0400 Subject: [PATCH 040/401] add bash run files --- task_dFC/run_scripts/dataset_info.json | 19 +++++++++ task_dFC/run_scripts/global_configs.json | 54 ++++++++++++++++++++++++ task_dFC/run_scripts/methods_config.json | 35 +++++++++++++++ task_dFC/run_scripts/run_FCS.sh | 19 +++++++++ task_dFC/run_scripts/run_ML.sh | 16 +++++++ task_dFC/run_scripts/run_dFC.sh | 24 +++++++++++ task_dFC/run_scripts/run_fmriprep.sh | 28 ++++++++++++ task_dFC/run_scripts/run_nifti_to_roi.sh | 24 +++++++++++ 8 files changed, 219 insertions(+) create mode 100644 task_dFC/run_scripts/dataset_info.json create mode 100644 task_dFC/run_scripts/global_configs.json create mode 100644 task_dFC/run_scripts/methods_config.json create mode 100644 task_dFC/run_scripts/run_FCS.sh create mode 100644 task_dFC/run_scripts/run_ML.sh create mode 100644 task_dFC/run_scripts/run_dFC.sh create mode 100644 task_dFC/run_scripts/run_fmriprep.sh create mode 100644 task_dFC/run_scripts/run_nifti_to_roi.sh diff --git a/task_dFC/run_scripts/dataset_info.json b/task_dFC/run_scripts/dataset_info.json new file mode 100644 index 0000000..adfa42a --- /dev/null +++ b/task_dFC/run_scripts/dataset_info.json @@ -0,0 +1,19 @@ +{ + "dataset" : "", + "main_root" : "/data/origami/dFC/DATA/task-based/openneuro/{dataset}", + "fmriprep_root" : "{main_root}/derivatives/fmriprep/23.1.3/output", + "roi_root" : "{main_root}/derivatives/ROI_timeseries", + "fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES", + "dFC_root" : "{main_root}/derivatives/dFC_assessed", + "ML_root" : "{main_root}/derivatives/ML", + "bold_suffix" : "_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz", + "SESSIONS" : [ + "ses-1" + ], + "TASKS" : [ + "task-A" + ], + "RUNS" : { + "task-A": ["run-01", "run-02", "run-03", "run-04", "run-05", "run-06"] + } +} diff --git 
a/task_dFC/run_scripts/global_configs.json b/task_dFC/run_scripts/global_configs.json new file mode 100644 index 0000000..ada5894 --- /dev/null +++ b/task_dFC/run_scripts/global_configs.json @@ -0,0 +1,54 @@ +{ + "DATASET_NAME": "", + "DATASET_ROOT": "/data/origami/dFC/DATA/task-based/openneuro//", + + "CONTAINER_STORE": "/data/origami/container_store/nipoppy/", + + "SINGULARITY_PATH": "singularity", + + "TEMPLATEFLOW_DIR": "/data/origami/templateflow", + + "SESSIONS": [], + "VISITS": [], + + "BIDS": { + "heudiconv": { + "VERSION": "0.11.6", + "CONTAINER": "heudiconv_{}.sif", + "URL": "" + }, + "validator":{ + "CONTAINER": "bids_validator.sif", + "URL": "" + + } + }, + + "PROC_PIPELINES": { + "mriqc": { + "VERSION": "23.1.0", + "CONTAINER": "mriqc_{}.sif", + "URL": "" + }, + "fmriprep": { + "VERSION": "23.1.3", + "CONTAINER": "fmriprep_{}.sif", + "URL": "" + }, + "freesurfer": { + "VERSION": "7.3.2", + "CONTAINER": "fmriprep_{}.sif", + "URL": "" + } + }, + + "TABULAR": { + "data_dictionary": { + "PATH": "", + "VERSION": "", + "URL": "" + } + }, + + "WORKFLOWS": [] +} diff --git a/task_dFC/run_scripts/methods_config.json b/task_dFC/run_scripts/methods_config.json new file mode 100644 index 0000000..d4013d4 --- /dev/null +++ b/task_dFC/run_scripts/methods_config.json @@ -0,0 +1,35 @@ +{ + "params_methods" : { + "W": 12, + "n_overlap": 1.0, + "sw_method": "pear_corr", + "tapered_window": true, + "TF_method": "WTC", + "clstr_base_measure": "SlidingWindow", + "hmm_iter": 20, + "dhmm_obs_state_ratio": 0.666, + "n_states": 5, + "n_subj_clstrs": 10, + "n_jobs": 2, + "verbose": 0, + "backend": "loky", + "normalization": true, + "num_subj": null, + "num_time_point": null + }, + "MEASURES_name_lst" : [ + "SlidingWindow", + "Time-Freq", + "CAP", + "ContinuousHMM", + "Windowless", + "Clustering", + "DiscreteHMM" + ], + "alter_hparams" : [], + "params_multi_analysis" : { + "n_jobs": null, + "verbose": 0, + "backend": "loky" + } +} diff --git a/task_dFC/run_scripts/run_FCS.sh 
b/task_dFC/run_scripts/run_FCS.sh new file mode 100644 index 0000000..fb22ed5 --- /dev/null +++ b/task_dFC/run_scripts/run_FCS.sh @@ -0,0 +1,19 @@ +#!/bin/sh +# +#$ -cwd +#$ -o logs/fcs_out.txt +#$ -e logs/fcs_err.txt +#$ -l h_vmem=64G +#$ -q origami.q +#$ -t 1-10 + +DATASET_INFO="./dataset_info.json" +METHODS_CONFIG="./methods_config.json" + +source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh +conda activate pydfc +python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/FCS_estimate.py" \ +--dataset_info $DATASET_INFO \ +--methods_config $METHODS_CONFIG + +conda deactivate diff --git a/task_dFC/run_scripts/run_ML.sh b/task_dFC/run_scripts/run_ML.sh new file mode 100644 index 0000000..feaf0b0 --- /dev/null +++ b/task_dFC/run_scripts/run_ML.sh @@ -0,0 +1,16 @@ +#!/bin/sh +# +#$ -cwd +#$ -o logs/ML_out.txt +#$ -e logs/ML_err.txt +#$ -l h_vmem=32G +#$ -q origami.q + +DATASET_INFO="./dataset_info.json" + +source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh +conda activate pydfc +python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/ML.py" \ +--dataset_info $DATASET_INFO + +conda deactivate diff --git a/task_dFC/run_scripts/run_dFC.sh b/task_dFC/run_scripts/run_dFC.sh new file mode 100644 index 0000000..0683935 --- /dev/null +++ b/task_dFC/run_scripts/run_dFC.sh @@ -0,0 +1,24 @@ +#!/bin/sh +# +#$ -cwd +#$ -o logs/dfc_out.txt +#$ -e logs/dfc_err.txt +#$ -l h_vmem=32G +#$ -q origami.q +#$ -t 1-200 + +SUBJECT_LIST="./subj_list.txt" +DATASET_INFO="./dataset_info.json" + +echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`" + +SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST` +echo "Subject ID: $SUBJECT_ID" + +source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh +conda activate pydfc +python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/dFC_assessment.py" \ +--dataset_info $DATASET_INFO \ +--participant_id $SUBJECT_ID + +conda deactivate diff --git a/task_dFC/run_scripts/run_fmriprep.sh b/task_dFC/run_scripts/run_fmriprep.sh new file mode 100644 index 
0000000..53dc89d --- /dev/null +++ b/task_dFC/run_scripts/run_fmriprep.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +#$ -cwd +#$ -o logs/fmriprep_out.log +#$ -e logs/fmriprep_err.log +#$ -l h_rt=24:00:00 +#$ -l h_vmem=32G +#$ -q origami.q + +#$ -t 1-122 + +# TODO replace with local paths +source "/data/origami/dFC/anaconda3/etc/profile.d/conda.sh" +conda activate nipoppy_env + +SUBJECT_LIST="./subj_list.txt" +GLOBAL_CONFIG="../proc/global_configs.json" + +echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`" + +SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST` +echo "Subject ID: $SUBJECT_ID" + +python "/data/origami/dFC/CODEs/nipoppy/nipoppy/workflow/proc_pipe/fmriprep/run_fmriprep.py" \ +--global_config $GLOBAL_CONFIG \ +--participant_id $SUBJECT_ID + +conda deactivate diff --git a/task_dFC/run_scripts/run_nifti_to_roi.sh b/task_dFC/run_scripts/run_nifti_to_roi.sh new file mode 100644 index 0000000..5f10f08 --- /dev/null +++ b/task_dFC/run_scripts/run_nifti_to_roi.sh @@ -0,0 +1,24 @@ +#!/bin/sh +# +#$ -cwd +#$ -o logs/roi_out.txt +#$ -e logs/roi_err.txt +#$ -l h_vmem=32G +#$ -q origami.q +#$ -t 1-200 + +SUBJECT_LIST="./subj_list.txt" +DATASET_INFO="./dataset_info.json" + +echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`" + +SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST` +echo "Subject ID: $SUBJECT_ID" + +source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh +conda activate pydfc +python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/nifti_to_roi_signal.py" \ +--dataset_info $DATASET_INFO \ +--participant_id $SUBJECT_ID + +conda deactivate From 33d67fca24669abbecd2eb28fcb7f8d3fb483215 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 14 Jun 2024 14:23:24 -0400 Subject: [PATCH 041/401] minor fix --- task_dFC/ML.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 88aa2a0..9ac3f62 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -564,12 +564,13 @@ def task_presence_clustering( # find 
number of components that explain 95% of variance pca = PCA() pca.fit(X_normalized) - n_components = np.where( - np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold - )[0][0] + n_components = ( + np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] + + 1 + ) pca = PCA(n_components=n_components) X_pca = pca.fit_transform(X_normalized) - kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=4) + kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5) labels_pred = kmeans.fit_predict(X_pca) # ARI score From b5591919366a67b9fd4b809d5aebea4ea23c0bf6 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 14 Jun 2024 14:34:37 -0400 Subject: [PATCH 042/401] minor change --- task_dFC/ML.py | 1 + 1 file changed, 1 insertion(+) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 9ac3f62..a8c9989 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -584,6 +584,7 @@ def task_presence_clustering( centroids_mat = dFC_vec2mat(centroids, n_regions) clustering_RESULTS = { + "StandardScaler": scaler, "num_PCs": n_components, "PCA": pca, "kmeans": kmeans, From 7e87cbf4b5f555ba33cfcd939128c3be6caea0a5 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 18 Jun 2024 22:38:14 -0400 Subject: [PATCH 043/401] minor fix --- task_dFC/ML.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index a8c9989..e5a52d6 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -414,13 +414,13 @@ def task_presence_classification( # logistic regression logistic_reg = make_pipeline(StandardScaler(), LogisticRegression()) # create a dictionary of all values we want to test for C - param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]} + param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]} # use gridsearch to test all values for C lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=5) # fit model to data lr_gscv.fit(X_train, y_train) - C = lr_gscv.best_params_["C"] + C = 
lr_gscv.best_params_["logisticregression__C"] log_reg = make_pipeline( StandardScaler(), From ae98a4d1152d06c5606d85536b030bd5f2afaf88 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 20 Jun 2024 16:48:49 -0400 Subject: [PATCH 044/401] add generate_report --- task_dFC/generate_report.py | 477 +++++++++++++++++++++++++ task_dFC/run_scripts/dataset_info.json | 1 + task_dFC/run_scripts/run_report.sh | 16 + 3 files changed, 494 insertions(+) create mode 100644 task_dFC/generate_report.py create mode 100644 task_dFC/run_scripts/run_report.sh diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py new file mode 100644 index 0000000..05169e4 --- /dev/null +++ b/task_dFC/generate_report.py @@ -0,0 +1,477 @@ +import argparse +import json +import os + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from nilearn import image, plotting +from sklearn.cluster import KMeans +from sklearn.decomposition import PCA +from sklearn.preprocessing import StandardScaler + +from pydfc import DFC, data_loader, task_utils +from pydfc.dfc_utils import TR_intersection, dFC_mat2vec, dFC_vec2mat, rank_norm + +####################################################################################### + + +def load_dFC(dFC_root, subj, task, dFC_id, run=None, session=None): + """ + Load the dFC results for a given subject, task, dFC_id, run and session. 
+ """ + if session is None: + if run is None: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + else: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + else: + if run is None: + dFC = np.load( + f"{dFC_root}/{subj}/{session}/dFC_{session}_{task}_{dFC_id}.npy", + allow_pickle="TRUE", + ).item() + else: + dFC = np.load( + f"{dFC_root}/{subj}/{session}/dFC_{session}_{task}_{run}_{dFC_id}.npy", + allow_pickle="TRUE", + ).item() + + return dFC + + +def load_task_data(roi_root, subj, task, run=None, session=None): + """ + Load the task data for a given subject, task and run. + """ + if session is None: + if run is None: + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" + ).item() + else: + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy", + allow_pickle="TRUE", + ).item() + else: + if run is None: + task_data = np.load( + f"{roi_root}/{subj}/{session}/{subj}_{session}_{task}_task-data.npy", + allow_pickle="TRUE", + ).item() + else: + task_data = np.load( + f"{roi_root}/{subj}/{session}/{subj}_{session}_{task}_{run}_task-data.npy", + allow_pickle="TRUE", + ).item() + + return task_data + + +# def plot_anatomical( +# fmriprep_root, +# subj, +# anat_suffix, +# session=None, +# ): +# anat_suffix = '_space-MNI152NLin2009cAsym_desc-preproc_T1w.nii.gz' +# anat_file = f"{fmriprep_root}/{subj}/anat/{subj}{anat_suffix}" +# display = plotting.plot_anat(anat_file, title="plot_anat") + + +# def plot_functional( +# fmriprep_root, +# subj, +# bold_suffix, +# task, +# session=None, +# run=None, +# ): +# if session is None: +# if run is None: +# task_file = f"{subj}_{task}{bold_suffix}" +# else: +# task_file = f"{subj}_{task}_{run}{bold_suffix}" +# func_file = f"{fmriprep_root}/{subj}/func/{task_file}" +# else: +# if run is None: +# task_file = f"{subj}_{session}_{task}{bold_suffix}" +# else: +# task_file = 
f"{subj}_{session}_{task}_{run}{bold_suffix}" +# func_file = f"{fmriprep_root}/{subj}/{session}/func/{task_file}" + +# # Compute voxel-wise mean functional image across time dimension. Now we have +# # functional image in 3D assigned in mean_func_img +# mean_func_img = image.mean_img(func_file) +# display = plotting.plot_anat(mean_func_img, title="plot_func") + + +def plot_roi_signals( + roi_root, + subj, + task, + session=None, + run=None, +): + if session is None: + if run is None: + file_name = "{subj_id}_{task}_time-series.npy" + else: + file_name = "{subj_id}_{task}_{run}_time-series.npy" + else: + if run is None: + file_name = "{subj_id}_{session}_{task}_time-series.npy" + else: + file_name = "{subj_id}_{session}_{task}_{run}_time-series.npy" + + BOLD = data_loader.load_TS( + data_root=roi_root, + file_name=file_name, + subj_id2load=subj, + task=task, + run=run, + session=session, + ) + + BOLD.visualize(nodes_lst=list(range(0, 10)), save_image=False, output_root=None) + + +def plot_event_labels( + roi_root, + subj, + task, + run=None, + session=None, +): + task_data = load_task_data(roi_root, subj, task, run, session) + Fs_task = task_data["Fs_task"] + + time = np.arange(0, task_data["event_labels"].shape[0]) / Fs_task + plt.figure(figsize=(35, 4)) + plt.plot(time, task_data["event_labels"], linewidth=4) + plt.title("Event labels") + plt.xlabel("Time (s)") + plt.show() + + +def plot_task_presence( + roi_root, + subj, + task, + run=None, + session=None, +): + task_data = load_task_data(roi_root, subj, task, run, session) + Fs_task = task_data["Fs_task"] + TR_task = 1 / Fs_task + TR_mri = task_data["TR_mri"] + Fs_mri = 1 / TR_mri + + task_presence_non_binarized = task_utils.extract_task_presence( + event_labels=task_data["event_labels"], + TR_task=TR_task, + TR_mri=task_data["TR_mri"], + binary=False, + ) + + task_presence = task_utils.extract_task_presence( + event_labels=task_data["event_labels"], + TR_task=TR_task, + TR_mri=task_data["TR_mri"], + 
binary=True, + binarizing_method="mean", + ) + + time = np.arange(0, task_presence.shape[0]) / Fs_mri + plt.figure(figsize=(35, 4)) + plt.plot(time, task_presence_non_binarized, linewidth=4) + plt.plot(time, task_presence, linewidth=4) + # plot mean of task presence_non_binarized as a line + plt.plot(time, np.mean(task_presence_non_binarized) * np.ones_like(time), linewidth=4) + plt.title("Task presence") + plt.xlabel("Time (s)") + plt.show() + + +def calculate_subj_lvl_task_presence_characteristics( + roi_root, + subj, + task, + run=None, + session=None, +): + task_data = load_task_data(roi_root, subj, task, run, session) + Fs_task = task_data["Fs_task"] + TR_task = 1 / Fs_task + + task_presence = task_utils.extract_task_presence( + event_labels=task_data["event_labels"], + TR_task=TR_task, + TR_mri=task_data["TR_mri"], + binary=True, + binarizing_method="mean", + ) + relative_task_on = task_utils.relative_task_on(task_presence) + # task duration + avg_task_duration, var_task_duration = task_utils.task_duration( + task_presence, task_data["TR_mri"] + ) + # rest duration + avg_rest_duration, var_rest_duration = task_utils.rest_duration( + task_presence, task_data["TR_mri"] + ) + # freq of transitions + num_of_transitions, relative_transition_freq = task_utils.transition_freq( + task_presence + ) + + print(f"Relative task on: {relative_task_on}") + print(f"Average task duration: {avg_task_duration} seconds") + print(f"Average rest duration: {avg_rest_duration} seconds") + print(f"Number of transitions: {num_of_transitions}") + print(f"Relative transition frequency: {relative_transition_freq}") + + +def plot_FCS(): + pass + + +def plot_dFC_matrices( + dFC_root, + subj, + task, + start_time, + end_time, + run=None, + session=None, +): + """ + plot dFC matrices for a given subject, task, run, session, start_time and end_time + parameters: + ---------- + dFC_root: str, path to dFC results + subj: str, subject id + task: str, task name + start_time: float, start time in 
seconds + end_time: float, end time in seconds + """ + task_data = load_task_data(roi_root, subj, task, run, session) + TR_mri = task_data["TR_mri"] + + dFC_lst = list() + for dFC_id in range(0, 20): # change this to the number of dFCs you have + try: + dFC = load_dFC(dFC_root, subj, task, dFC_id, run, session) + dFC_lst.append(dFC) + except Exception: + pass + + TRs = TR_intersection(dFC_lst) + start_TR = int(start_time / TR_mri) + end_TR = int(end_time / TR_mri) + start_TR_idx = np.where(np.array(TRs) >= start_TR)[0][0] + end_TR_idx = np.where(np.array(TRs) <= end_TR)[0][-1] + chosen_TRs = TRs[start_TR_idx:end_TR_idx] + + for dFC in dFC_lst: + print(dFC.measure.measure_name) + dFC.visualize_dFC(TRs=chosen_TRs, normalize=False, rank_norm=True, fix_lim=False) + + +def plot_ML_results(ML_root, output_root, task, run=None, session=None): + if session is None: + ML_scores = np.load( + f"{ML_root}/ML_scores_classify.npy", allow_pickle="TRUE" + ).item() + else: + ML_scores = np.load( + f"{ML_root}/{session}/ML_scores_classify.npy", allow_pickle="TRUE" + ).item() + + sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0}) + + sns.set_style("darkgrid") + + dataframe = pd.DataFrame(ML_scores) + if run is not None: + dataframe = dataframe[dataframe["run"] == run] + + g = sns.pointplot( + data=dataframe[dataframe["task"] == task], + x="dFC method", + y="KNN accuracy", + hue="group", + errorbar="sd", + linestyle="none", + dodge=True, + capsize=0.1, + ) + g.axhline(0.5, color="r", linestyle="--") + g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"}) + + # save the figure + if session is None: + output_dir = f"{output_root}/group_results/classification" + else: + output_dir = f"{output_root}/group_results/classification/{session}" + + if run is None: + f.savefig(f"{output_dir}/ML_results_classify.png", bbox_inches="tight") + else: + f.savefig(f"{output_dir}/ML_results_classify_{run}.png", bbox_inches="tight") + + +def 
plot_task_presence_characteristics(): + pass + + +def plot_clustering_results(): + pass + + +# def plot_dFC_clustering( +# dFC_root, +# subj, +# task, +# start_time, +# end_time, +# run=None, +# session=None, +# normalize_dFC=True, +# ): +# task_data = load_task_data(roi_root, subj, task, run, session) +# TR_mri = task_data['TR_mri'] + +# dFC_lst = list() +# for dFC_id in range(0, 20): # change this to the number of dFCs you have +# try: +# dFC = load_dFC(dFC_root, subj, task, dFC_id, run, session) +# dFC_lst.append(dFC) +# except Exception: +# pass + +# for dFC in dFC_lst: +# dFC_mat = dFC.get_dFC_mat() +# TR_array = dFC.TR_array +# if normalize_dFC: +# dFC_mat = rank_norm(dFC_mat) +# dFC_vecs = dFC_mat2vec(dFC_mat) + +# # apply kmeans clustering with PCA to dFC vectors +# n_clusters = 2 + +# scaler = StandardScaler() +# dFC_vecs = scaler.fit_transform(dFC_vecs) +# # PCA +# # find number of components that explain 95% of variance +# pca = PCA() +# pca.fit(dFC_vecs) +# n_components = np.where(np.cumsum(pca.explained_variance_ratio_) > 0.95)[0][0] + 1 +# # print(f"Number of components: {n_components}") +# pca = PCA(n_components=n_components) +# pca.fit(dFC_vecs) + + +# dFC_vecs_pca = pca.transform(dFC_vecs) +# kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=20) +# labels_pred = kmeans.fit_predict(dFC_vecs_pca) + +# start_TR = int(start_time/TR_mri) +# end_TR = int(end_time/TR_mri) +# start_TR_idx = np.where(np.array(TR_array) >= start_TR)[0][0] +# end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1] + +# # plot labels_pred +# plt.figure(figsize=(35, 2)) +# plt.plot(time[start_TR:end_TR], labels_pred[start_TR_idx:end_TR_idx], linewidth=4) +# # put vertical lines at the start of each TR +# for TR in chosen_TRs: +# plt.axvline(x=TR*TR_mri, color='r', linestyle='--') +# # plt.text(TR*TR_mri, 0.5, f"TR {TR}", fontsize=8, color='black', ha='center') +# plt.title(f"Cluster labels of {dFC.measure.measure_name}") +# plt.xlabel('Time (s)') +# plt.show() 
+ +####################################################################################### +if __name__ == "__main__": + # argparse + HELPTEXT = """ + Script to generate a report of subject results. + """ + + parser = argparse.ArgumentParser(description=HELPTEXT) + + parser.add_argument("--dataset_info", type=str, help="path to dataset info file") + + args = parser.parse_args() + + dataset_info_file = args.dataset_info + + # Read global configs + with open(dataset_info_file, "r") as f: + dataset_info = json.load(f) + + TASKS = dataset_info["TASKS"] + if "RUNS" in dataset_info: + RUNS = dataset_info["RUNS"] + else: + RUNS = None + if RUNS is None: + RUNS = {task: [None] for task in TASKS} + + if "SESSIONS" in dataset_info: + SESSIONS = dataset_info["SESSIONS"] + else: + SESSIONS = None + if SESSIONS is None: + SESSIONS = [None] + + if "{dataset}" in dataset_info["main_root"]: + main_root = dataset_info["main_root"].replace( + "{dataset}", dataset_info["dataset"] + ) + else: + main_root = dataset_info["main_root"] + + if "{main_root}" in dataset_info["roi_root"]: + roi_root = dataset_info["roi_root"].replace("{main_root}", main_root) + else: + roi_root = dataset_info["roi_root"] + + if "{main_root}" in dataset_info["dFC_root"]: + dFC_root = dataset_info["dFC_root"].replace("{main_root}", main_root) + else: + dFC_root = dataset_info["dFC_root"] + + if "{main_root}" in dataset_info["ML_root"]: + ML_root = dataset_info["ML_root"].replace("{main_root}", main_root) + else: + ML_root = dataset_info["ML_root"] + + if "{main_root}" in dataset_info["reports_root"]: + figures_root = dataset_info["reports_root"].replace("{main_root}", main_root) + else: + figures_root = dataset_info["reports_root"] + + print("Generating report...") + + for session in SESSIONS: + for task in TASKS: + for run in RUNS[task]: + plot_ML_results( + ML_root=ML_root, + output_root=figures_root, + task=task, + run=run, + session=session, + ) + + print("Report generated successfully!") + 
+####################################################################################### diff --git a/task_dFC/run_scripts/dataset_info.json b/task_dFC/run_scripts/dataset_info.json index adfa42a..8296d5b 100644 --- a/task_dFC/run_scripts/dataset_info.json +++ b/task_dFC/run_scripts/dataset_info.json @@ -6,6 +6,7 @@ "fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES", "dFC_root" : "{main_root}/derivatives/dFC_assessed", "ML_root" : "{main_root}/derivatives/ML", + "reports_root" : "{main_root}/derivatives/reports", "bold_suffix" : "_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz", "SESSIONS" : [ "ses-1" diff --git a/task_dFC/run_scripts/run_report.sh b/task_dFC/run_scripts/run_report.sh new file mode 100644 index 0000000..0b71969 --- /dev/null +++ b/task_dFC/run_scripts/run_report.sh @@ -0,0 +1,16 @@ +#!/bin/sh +# +#$ -cwd +#$ -o logs/report_out.txt +#$ -e logs/report_err.txt +#$ -l h_vmem=16G +#$ -q origami.q + +DATASET_INFO="./dataset_info.json" + +source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh +conda activate pydfc +python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/generate_report.py" \ +--dataset_info $DATASET_INFO + +conda deactivate From e919d2d88f9c3aac7f58560d646b88ccefbca127 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 20 Jun 2024 17:31:53 -0400 Subject: [PATCH 045/401] minor fix --- task_dFC/generate_report.py | 1 + 1 file changed, 1 insertion(+) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 05169e4..5aef3ed 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -321,6 +321,7 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None): else: output_dir = f"{output_root}/group_results/classification/{session}" + f = g.get_figure() if run is None: f.savefig(f"{output_dir}/ML_results_classify.png", bbox_inches="tight") else: From 811fe67caa582f68e040f31688355439521a6566 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 20 Jun 2024 17:38:41 -0400 
Subject: [PATCH 046/401] minor fix --- task_dFC/generate_report.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 5aef3ed..c377427 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -321,6 +321,9 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None): else: output_dir = f"{output_root}/group_results/classification/{session}" + if not os.path.exists(output_dir): + os.makedirs(output_dir) + f = g.get_figure() if run is None: f.savefig(f"{output_dir}/ML_results_classify.png", bbox_inches="tight") From fc17618447a0ab9cbd46eb185530c83bc98a3027 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 20 Jun 2024 17:48:35 -0400 Subject: [PATCH 047/401] minor fix --- task_dFC/generate_report.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index c377427..7f7f8f2 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -326,9 +326,11 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None): f = g.get_figure() if run is None: - f.savefig(f"{output_dir}/ML_results_classify.png", bbox_inches="tight") + f.savefig(f"{output_dir}/ML_results_classify_{task}.png", bbox_inches="tight") else: - f.savefig(f"{output_dir}/ML_results_classify_{run}.png", bbox_inches="tight") + f.savefig( + f"{output_dir}/ML_results_classify_{task}_{run}.png", bbox_inches="tight" + ) def plot_task_presence_characteristics(): From 5c5d2b0b7b72c5a83fcf294bb1d14047f3fd8808 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 20 Jun 2024 18:29:25 -0400 Subject: [PATCH 048/401] minor fix in saving figures --- task_dFC/generate_report.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 7f7f8f2..c3e6484 100644 --- a/task_dFC/generate_report.py +++ 
b/task_dFC/generate_report.py @@ -14,6 +14,14 @@ from pydfc import DFC, data_loader, task_utils from pydfc.dfc_utils import TR_intersection, dFC_mat2vec, dFC_vec2mat, rank_norm +################################# Parameters #################################### + +fig_dpi = 120 +fig_bbox_inches = "tight" +fig_pad = 0.1 +show_title = True +save_fig_format = "png" # pdf, png, + ####################################################################################### @@ -302,6 +310,7 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None): if run is not None: dataframe = dataframe[dataframe["run"] == run] + plt.figure(figsize=(10, 5)) g = sns.pointplot( data=dataframe[dataframe["task"] == task], x="dFC method", @@ -313,7 +322,8 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None): capsize=0.1, ) g.axhline(0.5, color="r", linestyle="--") - g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"}) + if show_title: + g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"}) # save the figure if session is None: @@ -324,14 +334,25 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None): if not os.path.exists(output_dir): os.makedirs(output_dir) - f = g.get_figure() if run is None: - f.savefig(f"{output_dir}/ML_results_classify_{task}.png", bbox_inches="tight") + plt.savefig( + f"{output_dir}/ML_results_classify_{task}.{save_fig_format}", + dpi=fig_dpi, + bbox_inches=fig_bbox_inches, + pad_inches=fig_pad, + format=save_fig_format, + ) else: - f.savefig( - f"{output_dir}/ML_results_classify_{task}_{run}.png", bbox_inches="tight" + plt.savefig( + f"{output_dir}/ML_results_classify_{task}_{run}.{save_fig_format}", + dpi=fig_dpi, + bbox_inches=fig_bbox_inches, + pad_inches=fig_pad, + format=save_fig_format, ) + plt.close() + def plot_task_presence_characteristics(): pass From 1a959e8b4ba80d68820b0b72b575aa5b99753131 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 20 Jun 2024 18:38:32 -0400 Subject: 
[PATCH 049/401] add logreg to report --- task_dFC/generate_report.py | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index c3e6484..09896eb 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -292,7 +292,20 @@ def plot_dFC_matrices( dFC.visualize_dFC(TRs=chosen_TRs, normalize=False, rank_norm=True, fix_lim=False) -def plot_ML_results(ML_root, output_root, task, run=None, session=None): +def plot_ML_results( + ML_root, output_root, task, run=None, session=None, ML_algorithm="KNN" +): + """ + Plot the ML results for a given task, run and session. + parameters: + ---------- + ML_root: str, path to ML results + output_root: str, path to save the figures + task: str, task name + run: int, run number + session: str, session name + ML_algorithm: str, ML algorithm name (default: KNN, other options: Logistic regression) + """ if session is None: ML_scores = np.load( f"{ML_root}/ML_scores_classify.npy", allow_pickle="TRUE" @@ -314,7 +327,7 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None): g = sns.pointplot( data=dataframe[dataframe["task"] == task], x="dFC method", - y="KNN accuracy", + y=f"{ML_algorithm} accuracy", hue="group", errorbar="sd", linestyle="none", @@ -334,9 +347,14 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None): if not os.path.exists(output_dir): os.makedirs(output_dir) + if ML_algorithm == "Logistic regression": + ML_algorithm_name = "LogReg" + elif ML_algorithm == "KNN": + ML_algorithm_name = "KNN" + if run is None: plt.savefig( - f"{output_dir}/ML_results_classify_{task}.{save_fig_format}", + f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}.{save_fig_format}", dpi=fig_dpi, bbox_inches=fig_bbox_inches, pad_inches=fig_pad, @@ -344,7 +362,7 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None): ) else: plt.savefig( - 
f"{output_dir}/ML_results_classify_{task}_{run}.{save_fig_format}", + f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}_{run}.{save_fig_format}", dpi=fig_dpi, bbox_inches=fig_bbox_inches, pad_inches=fig_pad, @@ -497,6 +515,15 @@ def plot_clustering_results(): task=task, run=run, session=session, + ML_algorithm="KNN", + ) + plot_ML_results( + ML_root=ML_root, + output_root=figures_root, + task=task, + run=run, + session=session, + ML_algorithm="Logistic regression", ) print("Report generated successfully!") From d10787682b391a6ccdd3a29a02a7d66037275998 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 20 Jun 2024 18:40:43 -0400 Subject: [PATCH 050/401] minor fix --- task_dFC/generate_report.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 09896eb..8ce31b9 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -517,14 +517,14 @@ def plot_clustering_results(): session=session, ML_algorithm="KNN", ) - plot_ML_results( - ML_root=ML_root, - output_root=figures_root, - task=task, - run=run, - session=session, - ML_algorithm="Logistic regression", - ) + # plot_ML_results( + # ML_root=ML_root, + # output_root=figures_root, + # task=task, + # run=run, + # session=session, + # ML_algorithm="Logistic regression", + # ) print("Report generated successfully!") From e76001a976e5d43b3f46fd847667b0f2c09b784a Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 20 Jun 2024 20:41:31 -0400 Subject: [PATCH 051/401] add dFC to reports --- task_dFC/generate_report.py | 50 +++++++++++++++++++++++++----- task_dFC/run_scripts/run_report.sh | 4 ++- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 8ce31b9..3942542 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -256,6 +256,7 @@ def plot_dFC_matrices( task, start_time, end_time, + output_root, run=None, 
session=None, ): @@ -287,9 +288,23 @@ def plot_dFC_matrices( end_TR_idx = np.where(np.array(TRs) <= end_TR)[0][-1] chosen_TRs = TRs[start_TR_idx:end_TR_idx] + output_dir = f"{output_root}/subject_results/{subj}/dFC_matrices" + if session is not None: + output_dir = f"{output_dir}/{session}" + output_dir = f"{output_dir}/{task}" + if run is not None: + output_dir = f"{output_dir}/{run}" + output_dir = f"{output_dir}/" + for dFC in dFC_lst: - print(dFC.measure.measure_name) - dFC.visualize_dFC(TRs=chosen_TRs, normalize=False, rank_norm=True, fix_lim=False) + dFC.visualize_dFC( + TRs=chosen_TRs, + normalize=False, + rank_norm=True, + fix_lim=False, + save_image=True, + output_root=output_dir, + ) def plot_ML_results( @@ -453,15 +468,21 @@ def plot_clustering_results(): parser = argparse.ArgumentParser(description=HELPTEXT) parser.add_argument("--dataset_info", type=str, help="path to dataset info file") + parser.add_argument("--subj_list", type=str, help="path to subject list file") args = parser.parse_args() dataset_info_file = args.dataset_info + subj_list_file = args.subj_list - # Read global configs + # Read dataset info with open(dataset_info_file, "r") as f: dataset_info = json.load(f) + # Read subject list file, a txt file with one subject id per line + with open(subj_list_file, "r") as f: + SUBJECTS = f.read().splitlines() + TASKS = dataset_info["TASKS"] if "RUNS" in dataset_info: RUNS = dataset_info["RUNS"] @@ -500,18 +521,33 @@ def plot_clustering_results(): ML_root = dataset_info["ML_root"] if "{main_root}" in dataset_info["reports_root"]: - figures_root = dataset_info["reports_root"].replace("{main_root}", main_root) + reports_root = dataset_info["reports_root"].replace("{main_root}", main_root) else: - figures_root = dataset_info["reports_root"] + reports_root = dataset_info["reports_root"] print("Generating report...") + for subj in SUBJECTS: + for session in SESSIONS: + for task in TASKS: + for run in RUNS[task]: + plot_dFC_matrices( + 
dFC_root=dFC_root, + subj=subj, + task=task, + start_time=50, + end_time=150, + output_root=reports_root, + run=run, + session=session, + ) + for session in SESSIONS: for task in TASKS: for run in RUNS[task]: plot_ML_results( ML_root=ML_root, - output_root=figures_root, + output_root=reports_root, task=task, run=run, session=session, @@ -519,7 +555,7 @@ def plot_clustering_results(): ) # plot_ML_results( # ML_root=ML_root, - # output_root=figures_root, + # output_root=reports_root, # task=task, # run=run, # session=session, diff --git a/task_dFC/run_scripts/run_report.sh b/task_dFC/run_scripts/run_report.sh index 0b71969..1734316 100644 --- a/task_dFC/run_scripts/run_report.sh +++ b/task_dFC/run_scripts/run_report.sh @@ -7,10 +7,12 @@ #$ -q origami.q DATASET_INFO="./dataset_info.json" +SUBJ_LIST="./subj_list.txt" source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh conda activate pydfc python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/generate_report.py" \ ---dataset_info $DATASET_INFO +--dataset_info $DATASET_INFO, \ +--subj_list $SUBJ_LIST conda deactivate From 2c2dd957bc76810193359df84783f45499494157 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 20 Jun 2024 20:49:36 -0400 Subject: [PATCH 052/401] minor bug --- task_dFC/run_scripts/run_report.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_dFC/run_scripts/run_report.sh b/task_dFC/run_scripts/run_report.sh index 1734316..2a00cc5 100644 --- a/task_dFC/run_scripts/run_report.sh +++ b/task_dFC/run_scripts/run_report.sh @@ -12,7 +12,7 @@ SUBJ_LIST="./subj_list.txt" source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh conda activate pydfc python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/generate_report.py" \ ---dataset_info $DATASET_INFO, \ +--dataset_info $DATASET_INFO \ --subj_list $SUBJ_LIST conda deactivate From 5a20c2d9e3cec7599955018c77f03720e92d5e7e Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 20 Jun 2024 23:15:21 -0400 Subject: [PATCH 053/401] add visuals to 
report --- task_dFC/generate_report.py | 163 ++++++++++++++++++++++++++++++++---- 1 file changed, 148 insertions(+), 15 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 3942542..0770aef 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -124,6 +124,10 @@ def plot_roi_signals( roi_root, subj, task, + start_time, + end_time, + output_root, + nodes_list=range(0, 10), session=None, run=None, ): @@ -138,6 +142,9 @@ def plot_roi_signals( else: file_name = "{subj_id}_{session}_{task}_{run}_time-series.npy" + task_data = load_task_data(roi_root, subj, task, run, session) + TR_mri = task_data["TR_mri"] + BOLD = data_loader.load_TS( data_root=roi_root, file_name=file_name, @@ -147,13 +154,47 @@ def plot_roi_signals( session=session, ) - BOLD.visualize(nodes_lst=list(range(0, 10)), save_image=False, output_root=None) + time = np.arange(0, BOLD.data.shape[1]) * TR_mri + start_time = 200 + end_time = 300 + start_TR = int(start_time / TR_mri) + end_TR = int(end_time / TR_mri) + fig_width = (start_time - end_time) / 5 + plt.figure(figsize=(fig_width, 3)) + for i in nodes_list: + plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4) + if show_title: + plt.title("ROI signals") + plt.xlabel("Time (s)") + + # save the figure + output_dir = f"{output_root}/subject_results/{subj}/ROI_signals" + if session is not None: + output_dir = f"{output_dir}/{session}" + output_dir = f"{output_dir}/{task}" + if run is not None: + output_dir = f"{output_dir}/{run}" + output_dir = f"{output_dir}/" + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + plt.savefig( + f"{output_dir}/ROI_signals.{save_fig_format}", + dpi=fig_dpi, + bbox_inches=fig_bbox_inches, + pad_inches=fig_pad, + format=save_fig_format, + ) + + plt.close() def plot_event_labels( roi_root, subj, task, + output_root, run=None, session=None, ): @@ -165,13 +206,35 @@ def plot_event_labels( plt.plot(time, task_data["event_labels"], 
linewidth=4) plt.title("Event labels") plt.xlabel("Time (s)") - plt.show() + + # save the figure + output_dir = f"{output_root}/subject_results/{subj}/event_labels" + if session is not None: + output_dir = f"{output_dir}/{session}" + output_dir = f"{output_dir}/{task}" + if run is not None: + output_dir = f"{output_dir}/{run}" + output_dir = f"{output_dir}/" + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + plt.savefig( + f"{output_dir}/event_labels.{save_fig_format}", + dpi=fig_dpi, + bbox_inches=fig_bbox_inches, + pad_inches=fig_pad, + format=save_fig_format, + ) + + plt.close() def plot_task_presence( roi_root, subj, task, + output_root, run=None, session=None, ): @@ -204,7 +267,28 @@ def plot_task_presence( plt.plot(time, np.mean(task_presence_non_binarized) * np.ones_like(time), linewidth=4) plt.title("Task presence") plt.xlabel("Time (s)") - plt.show() + + # save the figure + output_dir = f"{output_root}/subject_results/{subj}/task_presence" + if session is not None: + output_dir = f"{output_dir}/{session}" + output_dir = f"{output_dir}/{task}" + if run is not None: + output_dir = f"{output_dir}/{run}" + output_dir = f"{output_dir}/" + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + plt.savefig( + f"{output_dir}/task_presence.{save_fig_format}", + dpi=fig_dpi, + bbox_inches=fig_bbox_inches, + pad_inches=fig_pad, + format=save_fig_format, + ) + + plt.close() def calculate_subj_lvl_task_presence_characteristics( @@ -246,8 +330,14 @@ def calculate_subj_lvl_task_presence_characteristics( print(f"Relative transition frequency: {relative_transition_freq}") -def plot_FCS(): - pass +# def plot_FCS(): +# visualize_FCS( +# measure, +# normalize=True, +# fix_lim=False, +# save_image=save_image, +# output_root=output_root + "FCS/", +# ) def plot_dFC_matrices( @@ -531,16 +621,59 @@ def plot_clustering_results(): for session in SESSIONS: for task in TASKS: for run in RUNS[task]: - plot_dFC_matrices( - dFC_root=dFC_root, - subj=subj, - 
task=task, - start_time=50, - end_time=150, - output_root=reports_root, - run=run, - session=session, - ) + + try: + plot_dFC_matrices( + dFC_root=dFC_root, + subj=subj, + task=task, + start_time=50, + end_time=150, + output_root=reports_root, + run=run, + session=session, + ) + except Exception as e: + print(f"Error in plotting dFC matrices: {e}") + + try: + plot_roi_signals( + roi_root=roi_root, + subj=subj, + task=task, + start_time=50, + end_time=150, + nodes_list=range(0, 10), + output_root=reports_root, + run=run, + session=session, + ) + except Exception as e: + print(f"Error in plotting ROI signals: {e}") + + try: + plot_event_labels( + roi_root=roi_root, + subj=subj, + task=task, + output_root=reports_root, + run=run, + session=session, + ) + except Exception as e: + print(f"Error in plotting event labels: {e}") + + try: + plot_task_presence( + roi_root=roi_root, + subj=subj, + task=task, + output_root=reports_root, + run=run, + session=session, + ) + except Exception as e: + print(f"Error in plotting task presence: {e}") for session in SESSIONS: for task in TASKS: From 243976ad70f084b696b05676b89bdf5ebef69414 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 20 Jun 2024 23:23:52 -0400 Subject: [PATCH 054/401] minor fix --- task_dFC/ML.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index e5a52d6..e275f7d 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -339,7 +339,7 @@ def dFC_feature_extraction( dFC_measure_name == dFC.measure.measure_name ), "dFC measure is not consistent." 
- print(X_train.shape, X_test.shape, y_train.shape, y_test.shape) + # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape) subj_label_train = np.array(subj_label_train) subj_label_test = np.array(subj_label_test) @@ -743,7 +743,7 @@ def run_clustering( dataset_info_file = args.dataset_info - # Read global configs + # Read dataset info with open(dataset_info_file, "r") as f: dataset_info = json.load(f) From 7ca2d4daa0983441a0da4acb0e5826956b993c25 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 20 Jun 2024 23:27:58 -0400 Subject: [PATCH 055/401] test --- task_dFC/ML.py | 3 +++ task_dFC/run_scripts/run_ML.sh | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index e275f7d..c495d2e 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -1,6 +1,7 @@ import argparse import json import os +from re import S import numpy as np from sklearn.cluster import KMeans @@ -381,6 +382,7 @@ def task_presence_classification( SUBJECTS = find_available_subjects( dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id ) + SUBJECTS = SUBJECTS[:20] # randomly select train_test_ratio of the subjects for training # and rest for testing using numpy.random.choice @@ -537,6 +539,7 @@ def task_presence_clustering( SUBJECTS = find_available_subjects( dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id ) + SUBJECTS = SUBJECTS[:20] print(f"Number of subjects: {len(SUBJECTS)}") diff --git a/task_dFC/run_scripts/run_ML.sh b/task_dFC/run_scripts/run_ML.sh index feaf0b0..4ec431a 100644 --- a/task_dFC/run_scripts/run_ML.sh +++ b/task_dFC/run_scripts/run_ML.sh @@ -3,7 +3,7 @@ #$ -cwd #$ -o logs/ML_out.txt #$ -e logs/ML_err.txt -#$ -l h_vmem=32G +#$ -l h_vmem=64G #$ -q origami.q DATASET_INFO="./dataset_info.json" From c36a03fd94557921f6a2abcc0395b960b0b1c185 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 21 Jun 2024 11:50:10 -0400 Subject: [PATCH 056/401] minor fix --- task_dFC/ML.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index c495d2e..7af39ce 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -583,7 +583,7 @@ def task_presence_clustering( centroids = kmeans.cluster_centers_ centroids = pca.inverse_transform(centroids) centroids = scaler.inverse_transform(centroids) - n_regions = (1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2 + n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) centroids_mat = dFC_vec2mat(centroids, n_regions) clustering_RESULTS = { From e11a1f06cf7a7b508bca74634c03493458de5fc4 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 21 Jun 2024 12:04:17 -0400 Subject: [PATCH 057/401] minor fix --- task_dFC/generate_report.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 0770aef..c6827b9 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -159,7 +159,7 @@ def plot_roi_signals( end_time = 300 start_TR = int(start_time / TR_mri) end_TR = int(end_time / TR_mri) - fig_width = (start_time - end_time) / 5 + fig_width = (end_time - start_time) / 5 plt.figure(figsize=(fig_width, 3)) for i in nodes_list: plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4) @@ -617,6 +617,9 @@ def plot_clustering_results(): print("Generating report...") + # Generate report only for 5 random subjects + SUBJECTS = np.random.choice(SUBJECTS, 5) + for subj in SUBJECTS: for session in SESSIONS: for task in TASKS: From fe1184b3159a9b910d90e3eb83aef33d32e9bc65 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 21 Jun 2024 16:01:38 -0400 Subject: [PATCH 058/401] create html report --- task_dFC/generate_report.py | 64 +++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index c6827b9..9cbcb9a 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -548,6 +548,61 
@@ def plot_clustering_results(): # plt.xlabel('Time (s)') # plt.show() + +def create_html_report( + subj, + reports_root, +): + """ + This function creates an html report for the subject results + using the generated figures. + """ + # create html report + subj_dir = f"{reports_root}/subject_results/{subj}" + file = open(f"{subj_dir}/report.html", "w") + file.write("\n") + file.write("\n") + file.write("Subject results\n") + file.write("\n") + file.write("\n") + file.write("

Subject results

\n") + for session in SESSIONS: + if session is not None: + file.write(f"

{session}

\n") + for task in TASKS: + file.write(f"

{task}

\n") + for run in RUNS[task]: + if run is not None: + file.write(f"

{run}

\n") + session_task_run_dir = f"{subj_dir}" + if session is not None: + session_task_run_dir = f"{session_task_run_dir}/{session}" + session_task_run_dir = f"{session_task_run_dir}/{task}" + if run is not None: + session_task_run_dir = f"{session_task_run_dir}/{run}" + + file.write( + f"ROI signals\n" + ) + file.write( + f"Event labels\n" + ) + file.write( + f"Task presence\n" + ) + # for dFC matrices find all png files in the directory + dFC_matrices_dir = f"{subj_dir}/dFC_matrices/{session_task_run_dir}" + if os.path.exists(dFC_matrices_dir): + for file_name in os.listdir(dFC_matrices_dir): + if file_name.endswith(".png"): + file.write( + f"{file_name}\n" + ) + file.write("\n") + file.write("\n") + file.close() + + ####################################################################################### if __name__ == "__main__": # argparse @@ -617,8 +672,8 @@ def plot_clustering_results(): print("Generating report...") - # Generate report only for 5 random subjects - SUBJECTS = np.random.choice(SUBJECTS, 5) + # Generate report only one random subjects + SUBJECTS = np.random.choice(SUBJECTS, 1) for subj in SUBJECTS: for session in SESSIONS: @@ -677,6 +732,11 @@ def plot_clustering_results(): ) except Exception as e: print(f"Error in plotting task presence: {e}") + # create html report + try: + create_html_report(subj, reports_root) + except Exception as e: + print(f"Error in creating html report: {e}") for session in SESSIONS: for task in TASKS: From 41978448ceb6a223eba68a6e80136929039eadd4 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 21 Jun 2024 18:09:49 -0400 Subject: [PATCH 059/401] minor bug in ML --- task_dFC/ML.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 7af39ce..3ff4dec 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -607,9 +607,11 @@ def task_presence_clustering( features = X[subj_label == subj, :] target = y[subj_label == subj] - pred_KNN = kmeans.predict(features) + 
features_normalized = scaler.transform(features) + features_pca = pca.transform(features_normalized) + pred_kmeans = kmeans.predict(features_pca) - clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_KNN)) + clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_kmeans)) clustering_scores["task"].append(task) clustering_scores["run"].append(run) From a3dcf8b41d96bb5e151762f5f8971696f5607f93 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 21 Jun 2024 18:26:11 -0400 Subject: [PATCH 060/401] adjust fig size in report --- task_dFC/generate_report.py | 91 ++++++++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 21 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 9cbcb9a..f4d155c 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -155,14 +155,11 @@ def plot_roi_signals( ) time = np.arange(0, BOLD.data.shape[1]) * TR_mri - start_time = 200 - end_time = 300 - start_TR = int(start_time / TR_mri) - end_TR = int(end_time / TR_mri) - fig_width = (end_time - start_time) / 5 - plt.figure(figsize=(fig_width, 3)) + # keep the figure width proportional to the number of time points in data + fig_width = int(2.5 * task_data["num_time_mri"]) + plt.figure(figsize=(fig_width, 5)) for i in nodes_list: - plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4) + plt.plot(time, BOLD.data[i, :], linewidth=4) if show_title: plt.title("ROI signals") plt.xlabel("Time (s)") @@ -202,7 +199,9 @@ def plot_event_labels( Fs_task = task_data["Fs_task"] time = np.arange(0, task_data["event_labels"].shape[0]) / Fs_task - plt.figure(figsize=(35, 4)) + # keep the figure width proportional to the number of time points in data + fig_width = int(2.5 * task_data["num_time_mri"]) + plt.figure(figsize=(fig_width, 5)) plt.plot(time, task_data["event_labels"], linewidth=4) plt.title("Event labels") plt.xlabel("Time (s)") @@ -260,7 +259,9 @@ def plot_task_presence( ) time 
= np.arange(0, task_presence.shape[0]) / Fs_mri - plt.figure(figsize=(35, 4)) + # keep the figure width proportional to the number of time points in data + fig_width = int(2.5 * task_data["num_time_mri"]) + plt.figure(figsize=(fig_width, 5)) plt.plot(time, task_presence_non_binarized, linewidth=4) plt.plot(time, task_presence, linewidth=4) # plot mean of task presence_non_binarized as a line @@ -551,53 +552,95 @@ def plot_clustering_results(): def create_html_report( subj, + SESSIONS, + TASKS, + RUNS, reports_root, ): """ This function creates an html report for the subject results using the generated figures. """ + img_height = 150 # create html report subj_dir = f"{reports_root}/subject_results/{subj}" file = open(f"{subj_dir}/report.html", "w") file.write("\n") file.write("\n") - file.write("Subject results\n") + file.write(f"Subject {subj} Results\n") file.write("\n") file.write("\n") - file.write("

Subject results

\n") + file.write(f"

Subject {subj} Results

\n") for session in SESSIONS: if session is not None: - file.write(f"

{session}

\n") + file.write(f"

{session}

\n") for task in TASKS: - file.write(f"

{task}

\n") + file.write(f"

{task}

\n") for run in RUNS[task]: if run is not None: file.write(f"

{run}

\n") - session_task_run_dir = f"{subj_dir}" if session is not None: - session_task_run_dir = f"{session_task_run_dir}/{session}" - session_task_run_dir = f"{session_task_run_dir}/{task}" + session_task_run_dir = f"{session}/{task}" + else: + session_task_run_dir = f"{task}" if run is not None: session_task_run_dir = f"{session_task_run_dir}/{run}" + # display ROI signals + ROI_signals_img = ( + f"{subj_dir}/ROI_signals/{session_task_run_dir}/ROI_signals.png" + ) + img = plt.imread(ROI_signals_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) file.write( - f"ROI signals\n" + f"ROI signals\n" + ) + file.write("
\n") + + # display event labels + event_labels_img = ( + f"{subj_dir}/event_labels/{session_task_run_dir}/event_labels.png" ) + img = plt.imread(event_labels_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) file.write( - f"Event labels\n" + f"Event labels\n" + ) + file.write("
\n") + + # display task presence + task_presence_img = ( + f"{subj_dir}/task_presence/{session_task_run_dir}/task_presence.png" ) + img = plt.imread(task_presence_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) file.write( - f"Task presence\n" + f"Task presence\n" ) + file.write("
\n") + + # display dFC matrices # for dFC matrices find all png files in the directory dFC_matrices_dir = f"{subj_dir}/dFC_matrices/{session_task_run_dir}" if os.path.exists(dFC_matrices_dir): for file_name in os.listdir(dFC_matrices_dir): if file_name.endswith(".png"): + file.write(f"

{file_name[:file_name.find('_dFC')]}

\n") + # get the original size of the image + img = plt.imread(f"{dFC_matrices_dir}/{file_name}") + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) file.write( - f"{file_name}\n" + f"{file_name}\n" ) + file.write("
\n") file.write("\n") file.write("\n") file.close() @@ -734,7 +777,13 @@ def create_html_report( print(f"Error in plotting task presence: {e}") # create html report try: - create_html_report(subj, reports_root) + create_html_report( + subj=subj, + SESSIONS=SESSIONS, + TASKS=TASKS, + RUNS=RUNS, + reports_root=reports_root, + ) except Exception as e: print(f"Error in creating html report: {e}") From b34867bbd5d48ccd8a1ebb8cc8d789d98f8a09bf Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 21 Jun 2024 19:55:17 -0400 Subject: [PATCH 061/401] set time interval for report --- task_dFC/generate_report.py | 70 +++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 14 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index f4d155c..77e316a 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -1,6 +1,7 @@ import argparse import json import os +from tracemalloc import start import matplotlib.pyplot as plt import numpy as np @@ -155,11 +156,19 @@ def plot_roi_signals( ) time = np.arange(0, BOLD.data.shape[1]) * TR_mri - # keep the figure width proportional to the number of time points in data - fig_width = int(2.5 * task_data["num_time_mri"]) + start_TR = int(start_time / TR_mri) + end_TR = int(end_time / TR_mri) + # keep the figure width proportional to the number of time points + fig_width = int(2.5 * (end_time - start_time) / TR_mri) plt.figure(figsize=(fig_width, 5)) for i in nodes_list: - plt.plot(time, BOLD.data[i, :], linewidth=4) + plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4) + # put vertical lines at the start of each TR + for TR in range(start_TR, end_TR): + plt.axvline(x=TR * TR_mri, color="r", linestyle="--") + # show TR labels on the red lines with a small font and at the top + for TR in range(start_TR, end_TR): + plt.text(TR * TR_mri, 1.2, f"TR {TR}", fontsize=8, color="black", ha="center") if show_title: plt.title("ROI signals") plt.xlabel("Time (s)") @@ 
-191,18 +200,28 @@ def plot_event_labels( roi_root, subj, task, + start_time, + end_time, output_root, run=None, session=None, ): task_data = load_task_data(roi_root, subj, task, run, session) Fs_task = task_data["Fs_task"] + TR_task = 1 / Fs_task + TR_mri = task_data["TR_mri"] time = np.arange(0, task_data["event_labels"].shape[0]) / Fs_task - # keep the figure width proportional to the number of time points in data - fig_width = int(2.5 * task_data["num_time_mri"]) + start_timepoint = int(start_time / TR_task) + end_timepoint = int(end_time / TR_task) + # keep the figure width proportional to the number of time points + fig_width = int(2.5 * (end_time - start_time) / TR_mri) plt.figure(figsize=(fig_width, 5)) - plt.plot(time, task_data["event_labels"], linewidth=4) + plt.plot( + time[start_timepoint:end_timepoint], + task_data["event_labels"][start_timepoint:end_timepoint], + linewidth=4, + ) plt.title("Event labels") plt.xlabel("Time (s)") @@ -233,6 +252,8 @@ def plot_task_presence( roi_root, subj, task, + start_time, + end_time, output_root, run=None, session=None, @@ -259,13 +280,27 @@ def plot_task_presence( ) time = np.arange(0, task_presence.shape[0]) / Fs_mri + start_TR = int(start_time / TR_mri) + end_TR = int(end_time / TR_mri) # keep the figure width proportional to the number of time points in data - fig_width = int(2.5 * task_data["num_time_mri"]) + fig_width = int(2.5 * (end_time - start_time) / TR_mri) plt.figure(figsize=(fig_width, 5)) - plt.plot(time, task_presence_non_binarized, linewidth=4) - plt.plot(time, task_presence, linewidth=4) + plt.plot( + time[start_TR:end_TR], task_presence_non_binarized[start_TR:end_TR], linewidth=4 + ) + plt.plot(time[start_TR:end_TR], task_presence[start_TR:end_TR], linewidth=4) # plot mean of task presence_non_binarized as a line - plt.plot(time, np.mean(task_presence_non_binarized) * np.ones_like(time), linewidth=4) + plt.plot( + time[start_TR:end_TR], + np.mean(task_presence_non_binarized) * 
np.ones_like(time[start_TR:end_TR]), + linewidth=4, + ) + # put vertical lines at the start of each TR + for TR in range(start_TR, end_TR): + plt.axvline(x=TR * TR_mri, color="r", linestyle="--") + # show TR labels on the red lines with a small font and at the top + for TR in range(start_TR, end_TR): + plt.text(TR * TR_mri, 1.2, f"TR {TR}", fontsize=8, color="black", ha="center") plt.title("Task presence") plt.xlabel("Time (s)") @@ -718,6 +753,9 @@ def create_html_report( # Generate report only one random subjects SUBJECTS = np.random.choice(SUBJECTS, 1) + start_time = 0 + end_time = 200 + for subj in SUBJECTS: for session in SESSIONS: for task in TASKS: @@ -728,8 +766,8 @@ def create_html_report( dFC_root=dFC_root, subj=subj, task=task, - start_time=50, - end_time=150, + start_time=start_time, + end_time=end_time, output_root=reports_root, run=run, session=session, @@ -742,8 +780,8 @@ def create_html_report( roi_root=roi_root, subj=subj, task=task, - start_time=50, - end_time=150, + start_time=start_time, + end_time=end_time, nodes_list=range(0, 10), output_root=reports_root, run=run, @@ -757,6 +795,8 @@ def create_html_report( roi_root=roi_root, subj=subj, task=task, + start_time=start_time, + end_time=end_time, output_root=reports_root, run=run, session=session, @@ -769,6 +809,8 @@ def create_html_report( roi_root=roi_root, subj=subj, task=task, + start_time=start_time, + end_time=end_time, output_root=reports_root, run=run, session=session, From 71b96b0a30889a7f91aba34047825c73d78e1f6f Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 21 Jun 2024 21:35:14 -0400 Subject: [PATCH 062/401] change fig size report --- task_dFC/generate_report.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 77e316a..0e46f56 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -1,7 +1,6 @@ import argparse import json import os -from tracemalloc import start 
import matplotlib.pyplot as plt import numpy as np @@ -159,7 +158,7 @@ def plot_roi_signals( start_TR = int(start_time / TR_mri) end_TR = int(end_time / TR_mri) # keep the figure width proportional to the number of time points - fig_width = int(2.5 * (end_time - start_time) / TR_mri) + fig_width = int(5 * (end_time - start_time) / TR_mri) plt.figure(figsize=(fig_width, 5)) for i in nodes_list: plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4) @@ -215,7 +214,7 @@ def plot_event_labels( start_timepoint = int(start_time / TR_task) end_timepoint = int(end_time / TR_task) # keep the figure width proportional to the number of time points - fig_width = int(2.5 * (end_time - start_time) / TR_mri) + fig_width = int(5 * (end_time - start_time) / TR_mri) plt.figure(figsize=(fig_width, 5)) plt.plot( time[start_timepoint:end_timepoint], @@ -283,7 +282,7 @@ def plot_task_presence( start_TR = int(start_time / TR_mri) end_TR = int(end_time / TR_mri) # keep the figure width proportional to the number of time points in data - fig_width = int(2.5 * (end_time - start_time) / TR_mri) + fig_width = int(5 * (end_time - start_time) / TR_mri) plt.figure(figsize=(fig_width, 5)) plt.plot( time[start_TR:end_TR], task_presence_non_binarized[start_TR:end_TR], linewidth=4 @@ -750,8 +749,9 @@ def create_html_report( print("Generating report...") - # Generate report only one random subjects - SUBJECTS = np.random.choice(SUBJECTS, 1) + # Generate report only 5 random subjects + # SUBJECTS = np.random.choice(SUBJECTS, 5) + SUBJECTS = SUBJECTS[:1] start_time = 0 end_time = 200 From e8997b7ca275cc08354f706d7becfd3b86f3c607 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 21 Jun 2024 22:34:48 -0400 Subject: [PATCH 063/401] minor change --- task_dFC/generate_report.py | 11 +++++++---- task_dFC/run_scripts/run_dFC.sh | 2 +- task_dFC/run_scripts/run_fmriprep.sh | 2 +- task_dFC/run_scripts/run_nifti_to_roi.sh | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git 
a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 0e46f56..3078f0f 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -158,7 +158,7 @@ def plot_roi_signals( start_TR = int(start_time / TR_mri) end_TR = int(end_time / TR_mri) # keep the figure width proportional to the number of time points - fig_width = int(5 * (end_time - start_time) / TR_mri) + fig_width = int(2.5 * (end_time - start_time) / TR_mri) plt.figure(figsize=(fig_width, 5)) for i in nodes_list: plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4) @@ -214,7 +214,7 @@ def plot_event_labels( start_timepoint = int(start_time / TR_task) end_timepoint = int(end_time / TR_task) # keep the figure width proportional to the number of time points - fig_width = int(5 * (end_time - start_time) / TR_mri) + fig_width = int(2.5 * (end_time - start_time) / TR_mri) plt.figure(figsize=(fig_width, 5)) plt.plot( time[start_timepoint:end_timepoint], @@ -282,7 +282,7 @@ def plot_task_presence( start_TR = int(start_time / TR_mri) end_TR = int(end_time / TR_mri) # keep the figure width proportional to the number of time points in data - fig_width = int(5 * (end_time - start_time) / TR_mri) + fig_width = int(2.5 * (end_time - start_time) / TR_mri) plt.figure(figsize=(fig_width, 5)) plt.plot( time[start_TR:end_TR], task_presence_non_binarized[start_TR:end_TR], linewidth=4 @@ -595,7 +595,7 @@ def create_html_report( This function creates an html report for the subject results using the generated figures. """ - img_height = 150 + # create html report subj_dir = f"{reports_root}/subject_results/{subj}" file = open(f"{subj_dir}/report.html", "w") @@ -620,6 +620,8 @@ def create_html_report( if run is not None: session_task_run_dir = f"{session_task_run_dir}/{run}" + img_height = 100 + # display ROI signals ROI_signals_img = ( f"{subj_dir}/ROI_signals/{session_task_run_dir}/ROI_signals.png" @@ -660,6 +662,7 @@ def create_html_report( file.write("
\n") # display dFC matrices + img_height = 50 # for dFC matrices find all png files in the directory dFC_matrices_dir = f"{subj_dir}/dFC_matrices/{session_task_run_dir}" if os.path.exists(dFC_matrices_dir): diff --git a/task_dFC/run_scripts/run_dFC.sh b/task_dFC/run_scripts/run_dFC.sh index 0683935..684dbea 100644 --- a/task_dFC/run_scripts/run_dFC.sh +++ b/task_dFC/run_scripts/run_dFC.sh @@ -5,7 +5,7 @@ #$ -e logs/dfc_err.txt #$ -l h_vmem=32G #$ -q origami.q -#$ -t 1-200 +#$ -t 1-300 SUBJECT_LIST="./subj_list.txt" DATASET_INFO="./dataset_info.json" diff --git a/task_dFC/run_scripts/run_fmriprep.sh b/task_dFC/run_scripts/run_fmriprep.sh index 53dc89d..ea3c357 100644 --- a/task_dFC/run_scripts/run_fmriprep.sh +++ b/task_dFC/run_scripts/run_fmriprep.sh @@ -7,7 +7,7 @@ #$ -l h_vmem=32G #$ -q origami.q -#$ -t 1-122 +#$ -t 1-300 # TODO replace with local paths source "/data/origami/dFC/anaconda3/etc/profile.d/conda.sh" diff --git a/task_dFC/run_scripts/run_nifti_to_roi.sh b/task_dFC/run_scripts/run_nifti_to_roi.sh index 5f10f08..9af79f7 100644 --- a/task_dFC/run_scripts/run_nifti_to_roi.sh +++ b/task_dFC/run_scripts/run_nifti_to_roi.sh @@ -5,7 +5,7 @@ #$ -e logs/roi_err.txt #$ -l h_vmem=32G #$ -q origami.q -#$ -t 1-200 +#$ -t 1-300 SUBJECT_LIST="./subj_list.txt" DATASET_INFO="./dataset_info.json" From 067331656598acac569139d1a1c9f3dfb5e8d4df Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 21 Jun 2024 23:28:17 -0400 Subject: [PATCH 064/401] fix path in report --- task_dFC/generate_report.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 3078f0f..c53b4bd 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -630,6 +630,8 @@ def create_html_report( height, width, _ = img.shape # change the width so that height equals img_height width = int(width * img_height / height) + # replace the path to the image with a relative path + ROI_signals_img = 
ROI_signals_img.replace(subj_dir, ".") file.write( f"ROI signals\n" ) @@ -643,8 +645,10 @@ def create_html_report( height, width, _ = img.shape # change the width so that height equals img_height width = int(width * img_height / height) + # replace the path to the image with a relative path + event_labels_img = event_labels_img.replace(subj_dir, ".") file.write( - f"Event labels\n" + f"Event labels\n" ) file.write("
\n") @@ -656,26 +660,31 @@ def create_html_report( height, width, _ = img.shape # change the width so that height equals img_height width = int(width * img_height / height) + # replace the path to the image with a relative path + task_presence_img = task_presence_img.replace(subj_dir, ".") file.write( - f"Task presence\n" + f"Task presence\n" ) file.write("
\n") # display dFC matrices - img_height = 50 + img_height = 45 # for dFC matrices find all png files in the directory dFC_matrices_dir = f"{subj_dir}/dFC_matrices/{session_task_run_dir}" if os.path.exists(dFC_matrices_dir): for file_name in os.listdir(dFC_matrices_dir): if file_name.endswith(".png"): file.write(f"

{file_name[:file_name.find('_dFC')]}

\n") + dFC_matrices_img = f"{dFC_matrices_dir}/{file_name}" # get the original size of the image - img = plt.imread(f"{dFC_matrices_dir}/{file_name}") + img = plt.imread(dFC_matrices_img) height, width, _ = img.shape # change the width so that height equals img_height width = int(width * img_height / height) + # replace the path to the image with a relative path + dFC_matrices_img = dFC_matrices_img.replace(subj_dir, ".") file.write( - f"{file_name}\n" + f"{file_name}\n" ) file.write("
\n") file.write("\n") From ec9966bf3b5ae659b9af096760c68f7bcdc93ba8 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Sat, 22 Jun 2024 00:40:02 -0400 Subject: [PATCH 065/401] add task features to report --- task_dFC/generate_report.py | 240 +++++++++++++++++++++++++++++++++--- 1 file changed, 225 insertions(+), 15 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index c53b4bd..ecd6fa3 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -464,11 +464,13 @@ def plot_ML_results( dataframe = dataframe[dataframe["run"] == run] plt.figure(figsize=(10, 5)) + g = sns.pointplot( data=dataframe[dataframe["task"] == task], x="dFC method", y=f"{ML_algorithm} accuracy", hue="group", + hue_order=["train", "test"], errorbar="sd", linestyle="none", dodge=True, @@ -512,10 +514,6 @@ def plot_ML_results( plt.close() -def plot_task_presence_characteristics(): - pass - - def plot_clustering_results(): pass @@ -584,7 +582,76 @@ def plot_clustering_results(): # plt.show() -def create_html_report( +def plot_task_presence_features( + ML_root, + output_root, + session=None, + run=None, +): + """ + Plot the task presence features for a given session and run. 
+ for comparability of tasks, pass the same run number for all tasks + parameters: + ---------- + ML_root: str, path to ML results + output_root: str, path to save the figures + session: str, session name + run: int, run number + """ + if session is None: + task_features = np.load( + f"{ML_root}/task_features.npy", allow_pickle="TRUE" + ).item() + else: + task_features = np.load( + f"{ML_root}/{session}/task_features.npy", allow_pickle="TRUE" + ).item() + + sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0}) + + sns.set_style("darkgrid") + + dataframe = pd.DataFrame(task_features) + if run is not None: + dataframe = dataframe[dataframe["run"] == run] + + # FEATURES are columns in the dataframe except for 'task' and 'run' + FEATURES = list(dataframe.columns) + FEATURES.remove("task") + FEATURES.remove("run") + + for i, feature in enumerate(FEATURES): + plt.figure(figsize=(10, 5)) + sns.pointplot( + data=dataframe, + x="task", + y=feature, + errorbar="sd", + linestyle="none", + dodge=True, + capsize=0.1, + ) + # save the figure + if session is None: + output_dir = f"{output_root}/group_results/task_presence_features" + else: + output_dir = f"{output_root}/group_results/task_presence_features/{session}" + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + plt.savefig( + f"{output_dir}/task_presence_features_{feature}.{save_fig_format}", + dpi=fig_dpi, + bbox_inches=fig_bbox_inches, + pad_inches=fig_pad, + format=save_fig_format, + ) + + plt.close() + + +def create_html_report_subj_results( subj, SESSIONS, TASKS, @@ -692,6 +759,111 @@ def create_html_report( file.close() +def create_html_report_group_results( + SESSIONS, + TASKS, + RUNS, + reports_root, +): + """ + This function creates an html report for the group results + using the generated figures. 
+ """ + # create html report + group_dir = f"{reports_root}/group_results" + file = open(f"{group_dir}/report.html", "w") + file.write("\n") + file.write("\n") + file.write("Group Results\n") + file.write("\n") + file.write("\n") + file.write("

Group Results

\n") + + # task presence features + img_height = 300 + file.write("

Task presence features

\n") + for session in SESSIONS: + if session is not None: + file.write(f"

{session}

\n") + # display task presence features + if session is not None: + task_presence_features_dir = f"{group_dir}/task_presence_features/{session}" + else: + task_presence_features_dir = f"{group_dir}/task_presence_features" + # find all png files in the directory + for file_name in os.listdir(task_presence_features_dir): + if file_name.endswith(".png"): + file.write(f"

{file_name[:file_name.find('_task')]}

\n") + task_presence_features_img = f"{task_presence_features_dir}/{file_name}" + # get the original size of the image + img = plt.imread(task_presence_features_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + task_presence_features_img = task_presence_features_img.replace( + group_dir, "." + ) + file.write( + f"Task presence features\n" + ) + + # classification results + img_height = 300 + file.write("

Classification results

\n") + for session in SESSIONS: + if session is not None: + file.write(f"

{session}

\n") + for task in TASKS: + file.write(f"

{task}

\n") + for run in RUNS[task]: + # if run is not None: + # file.write(f"

{run}

\n") + if session is not None: + classification_dir = f"{group_dir}/classification/{session}" + else: + classification_dir = f"{group_dir}/classification" + + # display KNN classification results + if run is None: + classification_img = ( + f"{classification_dir}/ML_results_classify_KNN_{task}.png" + ) + else: + classification_img = ( + f"{classification_dir}/ML_results_classify_KNN_{task}_{run}.png" + ) + img = plt.imread(classification_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + classification_img = classification_img.replace(group_dir, ".") + file.write( + f"Classification results\n" + ) + + # # display Logistic regression classification results + # if run is None: + # classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}.png" + # else: + # classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{run}.png" + # img = plt.imread(classification_img) + # height, width, _ = img.shape + # # change the width so that height equals img_height + # width = int(width * img_height / height) + # # replace the path to the image with a relative path + # classification_img = classification_img.replace(group_dir, ".") + # file.write( + # f"Classification results\n" + # ) + + file.write("
\n") + file.write("\n") + file.write("\n") + file.close() + + ####################################################################################### if __name__ == "__main__": # argparse @@ -831,7 +1003,7 @@ def create_html_report( print(f"Error in plotting task presence: {e}") # create html report try: - create_html_report( + create_html_report_subj_results( subj=subj, SESSIONS=SESSIONS, TASKS=TASKS, @@ -839,19 +1011,46 @@ def create_html_report( reports_root=reports_root, ) except Exception as e: - print(f"Error in creating html report: {e}") + print(f"Error in creating html report for subject results: {e}") + + # find the common run number for all tasks for task presence features + common_run = None + for task in TASKS: + if common_run is None: + common_run = RUNS[task][0] + else: + if RUNS[task][0] != common_run: + common_run = None + # raise warning + print( + "Warning: Tasks have different run numbers for task presence features!" + ) + break for session in SESSIONS: + try: + plot_task_presence_features( + ML_root=ML_root, + output_root=reports_root, + session=session, + run=common_run, + ) + except Exception as e: + print(f"Error in plotting task presence features: {e}") + for task in TASKS: for run in RUNS[task]: - plot_ML_results( - ML_root=ML_root, - output_root=reports_root, - task=task, - run=run, - session=session, - ML_algorithm="KNN", - ) + try: + plot_ML_results( + ML_root=ML_root, + output_root=reports_root, + task=task, + run=run, + session=session, + ML_algorithm="KNN", + ) + except Exception as e: + print(f"Error in plotting ML results for KNN: {e}") # plot_ML_results( # ML_root=ML_root, # output_root=reports_root, @@ -861,6 +1060,17 @@ def create_html_report( # ML_algorithm="Logistic regression", # ) + # create html report + try: + create_html_report_group_results( + SESSIONS=SESSIONS, + TASKS=TASKS, + RUNS=RUNS, + reports_root=reports_root, + ) + except Exception as e: + print(f"Error in creating html report for group results: {e}") + 
print("Report generated successfully!") ####################################################################################### From e280de1fa630c7d2bc5bfb3541f691e14f1b8f5c Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 24 Jun 2024 12:19:41 -0400 Subject: [PATCH 066/401] minor change --- task_dFC/ML.py | 3 --- task_dFC/generate_report.py | 5 ++--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 3ff4dec..b81f3a2 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -1,7 +1,6 @@ import argparse import json import os -from re import S import numpy as np from sklearn.cluster import KMeans @@ -382,7 +381,6 @@ def task_presence_classification( SUBJECTS = find_available_subjects( dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id ) - SUBJECTS = SUBJECTS[:20] # randomly select train_test_ratio of the subjects for training # and rest for testing using numpy.random.choice @@ -539,7 +537,6 @@ def task_presence_clustering( SUBJECTS = find_available_subjects( dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id ) - SUBJECTS = SUBJECTS[:20] print(f"Number of subjects: {len(SUBJECTS)}") diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index ecd6fa3..1310745 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -781,7 +781,7 @@ def create_html_report_group_results( # task presence features img_height = 300 - file.write("

Task presence features

\n") + file.write("

Task Presence Features

\n") for session in SESSIONS: if session is not None: file.write(f"

{session}

\n") @@ -793,7 +793,6 @@ def create_html_report_group_results( # find all png files in the directory for file_name in os.listdir(task_presence_features_dir): if file_name.endswith(".png"): - file.write(f"

{file_name[:file_name.find('_task')]}

\n") task_presence_features_img = f"{task_presence_features_dir}/{file_name}" # get the original size of the image img = plt.imread(task_presence_features_img) @@ -810,7 +809,7 @@ def create_html_report_group_results( # classification results img_height = 300 - file.write("

Classification results

\n") + file.write("

Classification Results

\n") for session in SESSIONS: if session is not None: file.write(f"

{session}

\n") From dcbce16c9573a2012c565c351f495c07c195a932 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 24 Jun 2024 16:12:44 -0400 Subject: [PATCH 067/401] remove -t from run scripts --- task_dFC/run_scripts/run_FCS.sh | 1 - task_dFC/run_scripts/run_dFC.sh | 1 - task_dFC/run_scripts/run_fmriprep.sh | 2 -- task_dFC/run_scripts/run_nifti_to_roi.sh | 1 - 4 files changed, 5 deletions(-) diff --git a/task_dFC/run_scripts/run_FCS.sh b/task_dFC/run_scripts/run_FCS.sh index fb22ed5..a84c578 100644 --- a/task_dFC/run_scripts/run_FCS.sh +++ b/task_dFC/run_scripts/run_FCS.sh @@ -5,7 +5,6 @@ #$ -e logs/fcs_err.txt #$ -l h_vmem=64G #$ -q origami.q -#$ -t 1-10 DATASET_INFO="./dataset_info.json" METHODS_CONFIG="./methods_config.json" diff --git a/task_dFC/run_scripts/run_dFC.sh b/task_dFC/run_scripts/run_dFC.sh index 684dbea..124dc1f 100644 --- a/task_dFC/run_scripts/run_dFC.sh +++ b/task_dFC/run_scripts/run_dFC.sh @@ -5,7 +5,6 @@ #$ -e logs/dfc_err.txt #$ -l h_vmem=32G #$ -q origami.q -#$ -t 1-300 SUBJECT_LIST="./subj_list.txt" DATASET_INFO="./dataset_info.json" diff --git a/task_dFC/run_scripts/run_fmriprep.sh b/task_dFC/run_scripts/run_fmriprep.sh index ea3c357..ada2813 100644 --- a/task_dFC/run_scripts/run_fmriprep.sh +++ b/task_dFC/run_scripts/run_fmriprep.sh @@ -7,8 +7,6 @@ #$ -l h_vmem=32G #$ -q origami.q -#$ -t 1-300 - # TODO replace with local paths source "/data/origami/dFC/anaconda3/etc/profile.d/conda.sh" conda activate nipoppy_env diff --git a/task_dFC/run_scripts/run_nifti_to_roi.sh b/task_dFC/run_scripts/run_nifti_to_roi.sh index 9af79f7..1fff1da 100644 --- a/task_dFC/run_scripts/run_nifti_to_roi.sh +++ b/task_dFC/run_scripts/run_nifti_to_roi.sh @@ -5,7 +5,6 @@ #$ -e logs/roi_err.txt #$ -l h_vmem=32G #$ -q origami.q -#$ -t 1-300 SUBJECT_LIST="./subj_list.txt" DATASET_INFO="./dataset_info.json" From b5a3dfbacfb9975cc4555247ae997d582754aed5 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 2 Jul 2024 12:57:57 -0400 Subject: [PATCH 068/401] add dFC clustering to 
report --- task_dFC/generate_report.py | 366 +++++++++++++++++++++++++++--------- 1 file changed, 276 insertions(+), 90 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 1310745..7fd231c 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -514,72 +514,168 @@ def plot_ML_results( plt.close() -def plot_clustering_results(): - pass +def plot_clustering_results(ML_root, output_root, task, run=None, session=None): + """ + Plot the clustering results for a given task, run and session. + parameters: + ---------- + ML_root: str, path to ML results + output_root: str, path to save the figures + task: str, task name + run: int, run number + session: str, session name + """ + if session is None: + clustering_scores = np.load( + f"{ML_root}/clustering_scores.npy", allow_pickle="TRUE" + ).item() + else: + clustering_scores = np.load( + f"{ML_root}/{session}/clustering_scores.npy", allow_pickle="TRUE" + ).item() + sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0}) -# def plot_dFC_clustering( -# dFC_root, -# subj, -# task, -# start_time, -# end_time, -# run=None, -# session=None, -# normalize_dFC=True, -# ): -# task_data = load_task_data(roi_root, subj, task, run, session) -# TR_mri = task_data['TR_mri'] - -# dFC_lst = list() -# for dFC_id in range(0, 20): # change this to the number of dFCs you have -# try: -# dFC = load_dFC(dFC_root, subj, task, dFC_id, run, session) -# dFC_lst.append(dFC) -# except Exception: -# pass - -# for dFC in dFC_lst: -# dFC_mat = dFC.get_dFC_mat() -# TR_array = dFC.TR_array -# if normalize_dFC: -# dFC_mat = rank_norm(dFC_mat) -# dFC_vecs = dFC_mat2vec(dFC_mat) - -# # apply kmeans clustering with PCA to dFC vectors -# n_clusters = 2 - -# scaler = StandardScaler() -# dFC_vecs = scaler.fit_transform(dFC_vecs) -# # PCA -# # find number of components that explain 95% of variance -# pca = PCA() -# pca.fit(dFC_vecs) -# n_components = 
np.where(np.cumsum(pca.explained_variance_ratio_) > 0.95)[0][0] + 1 -# # print(f"Number of components: {n_components}") -# pca = PCA(n_components=n_components) -# pca.fit(dFC_vecs) - - -# dFC_vecs_pca = pca.transform(dFC_vecs) -# kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=20) -# labels_pred = kmeans.fit_predict(dFC_vecs_pca) - -# start_TR = int(start_time/TR_mri) -# end_TR = int(end_time/TR_mri) -# start_TR_idx = np.where(np.array(TR_array) >= start_TR)[0][0] -# end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1] - -# # plot labels_pred -# plt.figure(figsize=(35, 2)) -# plt.plot(time[start_TR:end_TR], labels_pred[start_TR_idx:end_TR_idx], linewidth=4) -# # put vertical lines at the start of each TR -# for TR in chosen_TRs: -# plt.axvline(x=TR*TR_mri, color='r', linestyle='--') -# # plt.text(TR*TR_mri, 0.5, f"TR {TR}", fontsize=8, color='black', ha='center') -# plt.title(f"Cluster labels of {dFC.measure.measure_name}") -# plt.xlabel('Time (s)') -# plt.show() + sns.set_style("darkgrid") + + dataframe = pd.DataFrame(clustering_scores) + if run is not None: + dataframe = dataframe[dataframe["run"] == run] + + plt.figure(figsize=(10, 5)) + g = sns.pointplot( + data=dataframe[dataframe["task"] == task], + x="dFC method", + y="Kmeans ARI", + errorbar="sd", + linestyle="none", + dodge=True, + capsize=0.1, + ) + g.axhline(0.0, color="r", linestyle="--") + if show_title: + g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"}) + + # save the figure + if session is None: + output_dir = f"{output_root}/group_results/clustering" + else: + output_dir = f"{output_root}/group_results/clustering/{session}" + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + if run is None: + plt.savefig( + f"{output_dir}/clustering_results_{task}.{save_fig_format}", + dpi=fig_dpi, + bbox_inches=fig_bbox_inches, + pad_inches=fig_pad, + format=save_fig_format, + ) + else: + plt.savefig( + 
f"{output_dir}/clustering_results_{task}_{run}.{save_fig_format}", + dpi=fig_dpi, + bbox_inches=fig_bbox_inches, + pad_inches=fig_pad, + format=save_fig_format, + ) + + plt.close() + + +def plot_dFC_clustering( + dFC_root, + subj, + task, + start_time, + end_time, + output_root, + run=None, + session=None, + normalize_dFC=True, +): + task_data = load_task_data(roi_root, subj, task, run, session) + TR_mri = task_data["TR_mri"] + + for dFC_id in range( + 0, 20 + ): # change this to the number of dFCs you have or right a function that finds available dFC ids + try: + dFC = load_dFC(dFC_root, subj, task, dFC_id, run, session) + except Exception: + pass + + dFC_mat = dFC.get_dFC_mat() + TR_array = dFC.TR_array + if normalize_dFC: + dFC_mat = rank_norm(dFC_mat) + dFC_vecs = dFC_mat2vec(dFC_mat) + + if session is None: + clustering_RESULTS = np.load( + f"{ML_root}/clustering_RESULTS_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + else: + clustering_RESULTS = np.load( + f"{ML_root}/{session}/clustering_RESULTS_{dFC_id}.npy", + allow_pickle="TRUE", + ).item() + + if run is None: + scaler = clustering_RESULTS[task]["StandardScaler"] + pca = clustering_RESULTS[task]["PCA"] + kmeans = clustering_RESULTS[task]["kmeans"] + else: + scaler = clustering_RESULTS[task][run]["StandardScaler"] + pca = clustering_RESULTS[task][run]["PCA"] + kmeans = clustering_RESULTS[task][run]["kmeans"] + + dFC_vecs_normalized = scaler.transform(dFC_vecs) + dFC_vecs_pca = pca.transform(dFC_vecs_normalized) + cluster_labels = kmeans.predict(dFC_vecs_pca) + + start_TR = int(start_time / TR_mri) + end_TR = int(end_time / TR_mri) + + start_TR_idx = np.where(np.array(TR_array) >= start_TR)[0][0] + end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1] + + fig_width = int(2.5 * (end_time - start_time) / TR_mri) + plt.figure(figsize=(fig_width, 5)) + time = TR_array[start_TR_idx:end_TR_idx] * TR_mri + plt.plot( + time[start_TR:end_TR], cluster_labels[start_TR_idx:end_TR_idx], linewidth=4 + ) + # put 
vertical lines at the start of each TR + for t in time: + plt.axvline(x=t, color="r", linestyle="--") + # plt.text(t, 0.5, f"TR {int(t/TR_mri)}", fontsize=8, color='black', ha='center') + plt.title(f"Cluster labels of {dFC.measure.measure_name}") + plt.xlabel("Time (s)") + + # save the figure + output_dir = f"{output_root}/subject_results/{subj}/dFC_clustering" + if session is not None: + output_dir = f"{output_dir}/{session}" + output_dir = f"{output_dir}/{task}" + if run is not None: + output_dir = f"{output_dir}/{run}" + output_dir = f"{output_dir}/" + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + plt.savefig( + f"{output_dir}/dFC_clustering_{dFC.measure.measure_name}.{save_fig_format}", + dpi=fig_dpi, + bbox_inches=fig_bbox_inches, + pad_inches=fig_pad, + format=save_fig_format, + ) + + plt.close() def plot_task_presence_features( @@ -754,6 +850,29 @@ def create_html_report_subj_results( f"{file_name}\n" ) file.write("
\n") + + # display dFC clustering + img_height = 100 + # for dFC matrices find all png files in the directory + dFC_clustering_dir = f"{subj_dir}/dFC_clustering/{session_task_run_dir}" + if os.path.exists(dFC_clustering_dir): + for file_name in os.listdir(dFC_clustering_dir): + if file_name.endswith(".png"): + file.write( + f"

{file_name[file_name.find('dFC_clustering_')+15:file_name.find('.png')]}

\n" + ) + dFC_clustering_img = f"{dFC_clustering_dir}/{file_name}" + # get the original size of the image + img = plt.imread(dFC_clustering_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + dFC_clustering_img = dFC_clustering_img.replace(subj_dir, ".") + file.write( + f"{file_name}\n" + ) + file.write("
\n") file.write("\n") file.write("\n") file.close() @@ -816,14 +935,15 @@ def create_html_report_group_results( for task in TASKS: file.write(f"

{task}

\n") for run in RUNS[task]: - # if run is not None: - # file.write(f"

{run}

\n") + if run is not None: + file.write(f"

{run}

\n") if session is not None: classification_dir = f"{group_dir}/classification/{session}" else: classification_dir = f"{group_dir}/classification" # display KNN classification results + file.write("

KNN

\n") if run is None: classification_img = ( f"{classification_dir}/ML_results_classify_KNN_{task}.png" @@ -842,22 +962,60 @@ def create_html_report_group_results( f"Classification results\n" ) - # # display Logistic regression classification results - # if run is None: - # classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}.png" - # else: - # classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{run}.png" - # img = plt.imread(classification_img) - # height, width, _ = img.shape - # # change the width so that height equals img_height - # width = int(width * img_height / height) - # # replace the path to the image with a relative path - # classification_img = classification_img.replace(group_dir, ".") - # file.write( - # f"Classification results\n" - # ) - - file.write("
\n") + # display Logistic regression classification results + file.write("

Logistic Regression

\n") + if run is None: + classification_img = ( + f"{classification_dir}/ML_results_classify_LogReg_{task}.png" + ) + else: + classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{run}.png" + img = plt.imread(classification_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + classification_img = classification_img.replace(group_dir, ".") + file.write( + f"Classification results\n" + ) + + file.write("
\n") + + # clustering results + img_height = 300 + file.write("

Clustering Results

\n") + for session in SESSIONS: + if session is not None: + file.write(f"

{session}

\n") + for task in TASKS: + file.write(f"

{task}

\n") + for run in RUNS[task]: + if run is not None: + file.write(f"

{run}

\n") + if session is not None: + clustering_dir = f"{group_dir}/clustering/{session}" + else: + clustering_dir = f"{group_dir}/clustering" + + # display clustering results + if run is None: + clustering_img = f"{clustering_dir}/clustering_results_{task}.png" + else: + clustering_img = ( + f"{clustering_dir}/clustering_results_{task}_{run}.png" + ) + img = plt.imread(clustering_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + clustering_img = clustering_img.replace(group_dir, ".") + file.write( + f"Clustering results\n" + ) + + file.write("
\n") file.write("\n") file.write("\n") file.close() @@ -1000,6 +1158,21 @@ def create_html_report_group_results( ) except Exception as e: print(f"Error in plotting task presence: {e}") + + try: + plot_dFC_clustering( + dFC_root=dFC_root, + subj=subj, + task=task, + start_time=start_time, + end_time=end_time, + output_root=reports_root, + run=run, + session=session, + normalize_dFC=True, + ) + except Exception as e: + print(f"Error in plotting dFC clustering: {e}") # create html report try: create_html_report_subj_results( @@ -1050,14 +1223,27 @@ def create_html_report_group_results( ) except Exception as e: print(f"Error in plotting ML results for KNN: {e}") - # plot_ML_results( - # ML_root=ML_root, - # output_root=reports_root, - # task=task, - # run=run, - # session=session, - # ML_algorithm="Logistic regression", - # ) + try: + plot_ML_results( + ML_root=ML_root, + output_root=reports_root, + task=task, + run=run, + session=session, + ML_algorithm="Logistic regression", + ) + except Exception as e: + print(f"Error in plotting ML results for Logistic regression: {e}") + try: + plot_clustering_results( + ML_root=ML_root, + output_root=reports_root, + task=task, + run=run, + session=session, + ) + except Exception as e: + print(f"Error in plotting clustering results: {e}") # create html report try: From 9e3d93ea1c74a47719a75154b0113ddde489a589 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 2 Jul 2024 13:47:15 -0400 Subject: [PATCH 069/401] minor change --- task_dFC/generate_report.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 7fd231c..1b0bbb5 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -159,6 +159,7 @@ def plot_roi_signals( end_TR = int(end_time / TR_mri) # keep the figure width proportional to the number of time points fig_width = int(2.5 * (end_time - start_time) / TR_mri) + fig_width = min(fig_width, 500) plt.figure(figsize=(fig_width, 5)) for i in 
nodes_list: plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4) @@ -215,6 +216,7 @@ def plot_event_labels( end_timepoint = int(end_time / TR_task) # keep the figure width proportional to the number of time points fig_width = int(2.5 * (end_time - start_time) / TR_mri) + fig_width = min(fig_width, 500) plt.figure(figsize=(fig_width, 5)) plt.plot( time[start_timepoint:end_timepoint], @@ -283,6 +285,7 @@ def plot_task_presence( end_TR = int(end_time / TR_mri) # keep the figure width proportional to the number of time points in data fig_width = int(2.5 * (end_time - start_time) / TR_mri) + fig_width = min(fig_width, 500) plt.figure(figsize=(fig_width, 5)) plt.plot( time[start_TR:end_TR], task_presence_non_binarized[start_TR:end_TR], linewidth=4 @@ -643,6 +646,7 @@ def plot_dFC_clustering( end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1] fig_width = int(2.5 * (end_time - start_time) / TR_mri) + fig_width = min(fig_width, 500) plt.figure(figsize=(fig_width, 5)) time = TR_array[start_TR_idx:end_TR_idx] * TR_mri plt.plot( From ef88cd6a6b7c986a82347642aea894fa5fa3e7c1 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 8 Jul 2024 13:49:30 -0400 Subject: [PATCH 070/401] add random forest to ML --- task_dFC/ML.py | 226 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 158 insertions(+), 68 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index b81f3a2..f288d9f 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -5,9 +5,10 @@ import numpy as np from sklearn.cluster import KMeans from sklearn.decomposition import PCA +from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score -from sklearn.model_selection import GridSearchCV +from sklearn.model_selection import GridSearchCV, RandomizedSearchCV from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline from 
sklearn.preprocessing import StandardScaler @@ -354,6 +355,133 @@ def dFC_feature_extraction( ) +def logistic_regression_classify(X_train, y_train, X_test, y_test): + """ + Logistic regression classification + """ + # create a pipeline with a logistic regression model to find the best C + logistic_reg = make_pipeline(StandardScaler(), LogisticRegression()) + # create a dictionary of all values we want to test for C + param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]} + # use gridsearch to test all values for C + lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=5) + # fit model to data + lr_gscv.fit(X_train, y_train) + + C = lr_gscv.best_params_["logisticregression__C"] + + log_reg = make_pipeline( + StandardScaler(), + LogisticRegression(C=C), + ).fit(X_train, y_train) + + RESULT = { + "log_reg_model": log_reg, + "log_reg_C": C, + "log_reg_train_score": log_reg.score(X_train, y_train), + "log_reg_test_score": log_reg.score(X_test, y_test), + } + + return RESULT + + +def KNN_classify(X_train, y_train, X_test, y_test, explained_var_threshold=0.95): + """ + KNN classification + """ + # find num_PCs + pca = PCA(svd_solver="full", whiten=False) + pca.fit(X_train) + num_PCs = ( + np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] + + 1 + ) + + # create a pipeline with a knn model to find the best n_neighbors + knn = make_pipeline( + StandardScaler(), + PCA(n_components=num_PCs), + KNeighborsClassifier(), + ) + # create a dictionary of all values we want to test for n_neighbors + param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)} + # use gridsearch to test all values for n_neighbors + knn_gscv = GridSearchCV(knn, param_grid, cv=5) + # fit model to data + knn_gscv.fit(X_train, y_train) + + n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"] + + neigh = make_pipeline( + StandardScaler(), + PCA(n_components=num_PCs), + KNeighborsClassifier(n_neighbors=n_neighbors), + 
).fit(X_train, y_train) + + RESULT = { + "KNN_pca": pca, + "KNN_num_PCs": num_PCs, + "KNN_cv_results": knn_gscv.cv_results_, + "KNN_model": neigh, + "KNN_train_score": neigh.score(X_train, y_train), + "KNN_test_score": neigh.score(X_test, y_test), + } + + return RESULT + + +def random_forest_classify( + X_train, y_train, X_test, y_test, explained_var_threshold=0.95 +): + """ + Random Forest classification + """ + # find num_PCs + pca = PCA(svd_solver="full", whiten=False) + pca.fit(X_train) + num_PCs = ( + np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] + + 1 + ) + num_PCs = min(num_PCs, 100) + + # create a pipeline with a random forest model to find the best n_estimators + rf = make_pipeline( + StandardScaler(), + PCA(n_components=num_PCs), + RandomForestClassifier(), + ) + # create a dictionary of all values we want to test for n_estimators + param_grid = { + "randomforestclassifier__n_estimators": [10, 50, 100, 200], + "randomforestclassifier__max_depth": [None, 5, 10, 15, 20, 30], + } + # use gridsearch to test all values for n_estimators + rf_gscv = GridSearchCV(rf, param_grid, cv=5) + # fit model to data + rf_gscv.fit(X_train, y_train) + + n_estimators = rf_gscv.best_params_["randomforestclassifier__n_estimators"] + max_depth = rf_gscv.best_params_["randomforestclassifier__max_depth"] + + rf = make_pipeline( + StandardScaler(), + PCA(n_components=num_PCs), + RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth), + ).fit(X_train, y_train) + + RESULT = { + "RF_pca": pca, + "RF_num_PCs": num_PCs, + "RF_cv_results": rf_gscv.cv_results_, + "RF_model": rf, + "RF_train_score": rf.score(X_train, y_train), + "RF_test_score": rf.score(X_test, y_test), + } + + return RESULT + + def task_presence_classification( task, dFC_id, @@ -367,7 +495,7 @@ def task_presence_classification( explained_var_threshold=0.95, ): """ - perform task presence classification using logistic regression and KNN + perform task presence 
classification using logistic regression, KNN, or Random Forest for a given task and dFC method and run. """ if run is None: @@ -412,72 +540,25 @@ def task_presence_classification( print("task presence classification ...") # logistic regression - logistic_reg = make_pipeline(StandardScaler(), LogisticRegression()) - # create a dictionary of all values we want to test for C - param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]} - # use gridsearch to test all values for C - lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=5) - # fit model to data - lr_gscv.fit(X_train, y_train) - - C = lr_gscv.best_params_["logisticregression__C"] + log_reg_RESULT = logistic_regression_classify(X_train, y_train, X_test, y_test) - log_reg = make_pipeline( - StandardScaler(), - LogisticRegression(C=C), - ).fit(X_train, y_train) + # # KNN + # KNN_RESULT = KNN_classify( + # X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold + # ) - # KNN - # find num_PCs - pca = PCA(svd_solver="full", whiten=False) - pca.fit(X_train) - num_PCs = ( - np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] - + 1 - ) - - # create a pipeline with a knn model to find the best n_neighbors - knn = make_pipeline( - StandardScaler(), - PCA(n_components=num_PCs), - KNeighborsClassifier(), + # Random Forest + RF_RESULT = random_forest_classify( + X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold ) - # create a dictionary of all values we want to test for n_neighbors - param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)} - # use gridsearch to test all values for n_neighbors - knn_gscv = GridSearchCV(knn, param_grid, cv=5) - # fit model to data - knn_gscv.fit(X_train, y_train) - n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"] - - neigh = make_pipeline( - StandardScaler(), - PCA(n_components=num_PCs), - KNeighborsClassifier(n_neighbors=n_neighbors), - 
).fit(X_train, y_train) - - ML_RESULT = { - "logistic regression": log_reg, - "logistic regression C": C, - "logistic regression train score": log_reg.score(X_train, y_train), - "logistic regression test score": log_reg.score(X_test, y_test), - "pca": pca, - "num_PCs": num_PCs, - "cv_results": knn_gscv.cv_results_, - "KNN": neigh, - "KNN train score": neigh.score(X_train, y_train), - "KNN test score": neigh.score(X_test, y_test), - } - - print( - f"Logistic regression train score {measure_name} {task}: {log_reg.score(X_train, y_train)}" - ) - print( - f"Logistic regression test score {measure_name} {task}: {log_reg.score(X_test, y_test)}" - ) - print(f"KNN train score {measure_name} {task}: {neigh.score(X_train, y_train)}") - print(f"KNN test score {measure_name} {task}: {neigh.score(X_test, y_test)}") + ML_RESULT = {} + for key in log_reg_RESULT: + ML_RESULT[key] = log_reg_RESULT[key] + # for key in KNN_RESULT: + # ML_RESULT[key] = KNN_RESULT[key] + for key in RF_RESULT: + ML_RESULT[key] = RF_RESULT[key] # measure pred score on each subj @@ -488,8 +569,12 @@ def task_presence_classification( "run": list(), "dFC method": list(), "Logistic regression accuracy": list(), - "KNN accuracy": list(), + # "KNN accuracy": list(), } + log_reg = log_reg_RESULT["log_reg_model"] + # KNN = KNN_RESULT["KNN_model"] + RF = RF_RESULT["RF_model"] + for subj in SUBJECTS: ML_scores["subj_id"].append(subj) if subj in train_subjects: @@ -502,12 +587,16 @@ def task_presence_classification( target = y_test[subj_label_test == subj] pred_lr = log_reg.predict(features) - pred_KNN = neigh.predict(features) + # pred_KNN = KNN.predict(features) + pred_RF = RF.predict(features) ML_scores["Logistic regression accuracy"].append( balanced_accuracy_score(target, pred_lr) ) - ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN)) + # ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN)) + ML_scores["Random Forest accuracy"].append( + 
balanced_accuracy_score(target, pred_RF) + ) ML_scores["task"].append(task) ML_scores["run"].append(run) @@ -637,7 +726,8 @@ def run_classification( "run": list(), "dFC method": list(), "Logistic regression accuracy": list(), - "KNN accuracy": list(), + # "KNN accuracy": list(), + "Random Forest accuracy": list(), } for dFC_id in range(0, 7): print(f"=================== dFC {dFC_id} ===================") From 571e8cd564944d41f6d820ecb6a35c9a1e202968 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 8 Jul 2024 18:36:07 -0400 Subject: [PATCH 071/401] paralel dFC_ids in ML --- task_dFC/ML.py | 183 +++++++++++++++++++++++++------------------------ 1 file changed, 95 insertions(+), 88 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index f288d9f..db2489e 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -707,6 +707,7 @@ def task_presence_clustering( def run_classification( + dFC_id, TASKS, RUNS, SESSIONS, @@ -729,42 +730,41 @@ def run_classification( # "KNN accuracy": list(), "Random Forest accuracy": list(), } - for dFC_id in range(0, 7): - print(f"=================== dFC {dFC_id} ===================") - - ML_RESULT = {} - for task_id, task in enumerate(TASKS): - ML_RESULT[task] = {} - for run in RUNS[task]: - ML_RESULT_new, ML_scores_new = task_presence_classification( - task=task, - dFC_id=dFC_id, - roi_root=roi_root, - dFC_root=dFC_root, - run=run, - session=session, - dynamic_pred=dynamic_pred, - normalize_dFC=normalize_dFC, - ) - if run is None: - ML_RESULT[task] = ML_RESULT_new - else: - ML_RESULT[task][run] = ML_RESULT_new - for key in ML_scores: - ML_scores[key].extend(ML_scores_new[key]) - if session is None: - folder = f"{output_root}" - else: - folder = f"{output_root}/{session}" - if not os.path.exists(folder): - os.makedirs(folder) - np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT) + ML_RESULT = {} + for task_id, task in enumerate(TASKS): + ML_RESULT[task] = {} + for run in RUNS[task]: + ML_RESULT_new, ML_scores_new = 
task_presence_classification( + task=task, + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + run=run, + session=session, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, + ) + if run is None: + ML_RESULT[task] = ML_RESULT_new + else: + ML_RESULT[task][run] = ML_RESULT_new + for key in ML_scores: + ML_scores[key].extend(ML_scores_new[key]) - np.save(f"{folder}/ML_scores_classify.npy", ML_scores) + if session is None: + folder = f"{output_root}" + else: + folder = f"{output_root}/{session}" + if not os.path.exists(folder): + os.makedirs(folder) + np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT) + + np.save(f"{folder}/ML_scores_classify_{dFC_id}.npy", ML_scores) def run_clustering( + dFC_id, TASKS, RUNS, SESSIONS, @@ -783,40 +783,36 @@ def run_clustering( "dFC method": list(), "Kmeans ARI": list(), } - for dFC_id in range(0, 7): - print(f"=================== dFC {dFC_id} ===================") - - clustering_RESULTS = {} - for task_id, task in enumerate(TASKS): - clustering_RESULTS[task] = {} - for run in RUNS[task]: - clustering_RESULTS_new, clustering_scores_new = ( - task_presence_clustering( - task=task, - dFC_id=dFC_id, - roi_root=roi_root, - dFC_root=dFC_root, - run=run, - session=session, - normalize_dFC=normalize_dFC, - ) - ) - if run is None: - clustering_RESULTS[task] = clustering_RESULTS_new - else: - clustering_RESULTS[task][run] = clustering_RESULTS_new - for key in clustering_scores: - clustering_scores[key].extend(clustering_scores_new[key]) - if session is None: - folder = f"{output_root}" - else: - folder = f"{output_root}/{session}" - if not os.path.exists(folder): - os.makedirs(folder) - np.save(f"{folder}/clustering_RESULTS_{dFC_id}.npy", clustering_RESULTS) + clustering_RESULTS = {} + for task_id, task in enumerate(TASKS): + clustering_RESULTS[task] = {} + for run in RUNS[task]: + clustering_RESULTS_new, clustering_scores_new = task_presence_clustering( + task=task, + dFC_id=dFC_id, + roi_root=roi_root, + 
dFC_root=dFC_root, + run=run, + session=session, + normalize_dFC=normalize_dFC, + ) + if run is None: + clustering_RESULTS[task] = clustering_RESULTS_new + else: + clustering_RESULTS[task][run] = clustering_RESULTS_new + for key in clustering_scores: + clustering_scores[key].extend(clustering_scores_new[key]) + + if session is None: + folder = f"{output_root}" + else: + folder = f"{output_root}/{session}" + if not os.path.exists(folder): + os.makedirs(folder) + np.save(f"{folder}/clustering_RESULTS_{dFC_id}.npy", clustering_RESULTS) - np.save(f"{folder}/clustering_scores.npy", clustering_scores) + np.save(f"{folder}/clustering_scores_{dFC_id}.npy", clustering_scores) ####################################################################################### @@ -885,32 +881,43 @@ def run_clustering( roi_root=roi_root, output_root=ML_root, ) - print("Task features extraction finished.") - print("Task presence classification started ...") - run_classification( - TASKS=TASKS, - RUNS=RUNS, - SESSIONS=SESSIONS, - roi_root=roi_root, - dFC_root=dFC_root, - output_root=ML_root, - dynamic_pred="no", - normalize_dFC=True, - ) - print("Task presence classification finished.") - print("Task presence clustering started ...") - run_clustering( - TASKS=TASKS, - RUNS=RUNS, - SESSIONS=SESSIONS, - roi_root=roi_root, - dFC_root=dFC_root, - output_root=ML_root, - normalize_dFC=True, - ) - print("Task presence clustering finished.") - print("Task presence prediction CODE finished running.") + job_id = int(os.getenv("SGE_TASK_ID")) + dFC_id = job_id - 1 # SGE_TASK_ID starts from 1 not 0 + + print(f"Task presence classification started for dFC ID {dFC_id}...") + try: + run_classification( + dFC_id=dFC_id, + TASKS=TASKS, + RUNS=RUNS, + SESSIONS=SESSIONS, + roi_root=roi_root, + dFC_root=dFC_root, + output_root=ML_root, + dynamic_pred="no", + normalize_dFC=True, + ) + except Exception as e: + print(f"Error in classification for dFC ID {dFC_id}: {e}") + print(f"Task presence classification 
finished for dFC ID {dFC_id}.") + print(f"Task presence clustering started for dFC ID {dFC_id} ...") + try: + run_clustering( + dFC_id=dFC_id, + TASKS=TASKS, + RUNS=RUNS, + SESSIONS=SESSIONS, + roi_root=roi_root, + dFC_root=dFC_root, + output_root=ML_root, + normalize_dFC=True, + ) + except Exception as e: + print(f"Error in clustering for dFC ID {dFC_id}: {e}") + + print(f"Task presence clustering finished for dFC ID {dFC_id}.") + print(f"Task presence prediction finished for dFC ID {dFC_id}.") ####################################################################################### From abf27ed53e508c878cfe74314901a7ea6a50c7db Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 8 Jul 2024 19:28:59 -0400 Subject: [PATCH 072/401] minor change --- task_dFC/ML.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index db2489e..080efb1 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -454,7 +454,7 @@ def random_forest_classify( # create a dictionary of all values we want to test for n_estimators param_grid = { "randomforestclassifier__n_estimators": [10, 50, 100, 200], - "randomforestclassifier__max_depth": [None, 5, 10, 15, 20, 30], + "randomforestclassifier__max_depth": [None, 5, 10, 20, 30], } # use gridsearch to test all values for n_estimators rf_gscv = GridSearchCV(rf, param_grid, cv=5) From e530e07bcaa7cfefc5ab30260a92c586609e5475 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 8 Jul 2024 20:04:00 -0400 Subject: [PATCH 073/401] add new tasks to simul --- pydfc/simul_utils.py | 54 +++++++++++++++++++++++++++++++- simul_dFC/task_data_simulator.py | 33 +++++++++++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py index b4e155f..fa4631e 100644 --- a/pydfc/simul_utils.py +++ b/pydfc/simul_utils.py @@ -60,6 +60,7 @@ def simulate_task_BOLD( sim_length, BOLD_period, TAVG_period, + num_stimulated_regions=5, global_conn_coupling_coef=0.0126, D=0.001, 
conn_speed=1.0, @@ -68,6 +69,27 @@ def simulate_task_BOLD( ): """ Simulate BOLD signal for a task. + + Parameters + ---------- + onset_time : float + The onset time of the task. + task_duration : float + The duration of the task. + task_block_duration : float + The duration of the task block. + sim_length : float + The length of the simulation. + BOLD_period : float + The BOLD period. + TAVG_period : float + The TAVG period. + num_stimulated_regions : int, optional + The number of stimulated regions. The default is 5. + if num_stimulated_regions is 5, the stimulated regions are: + [0, 7, 13, 33, 42] + if num_stimulated_regions is 15, the stimulated regions are: + [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70] """ # randomize some parameters for each subjects onset = np.random.normal(loc=onset_time, scale=0.5) # seconds @@ -78,8 +100,33 @@ def simulate_task_BOLD( conn.speed = np.array([conn_speed_rand]) # configure stimulus spatial pattern + if num_stimulated_regions == 5: + stimulated_regions_list = [0, 7, 13, 33, 42] + elif num_stimulated_regions == 15: + stimulated_regions_list = [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60, + 65, + 70, + ] + else: + stimulated_regions_list = np.random.choice( + np.arange(76), num_stimulated_regions, replace=False + ) + stimulated_regions_list = list(stimulated_regions_list) weighting = create_random_stimulus_weights( - stimulated_regions_list=[0, 7, 13, 33, 42], n_regions=76 + stimulated_regions_list=stimulated_regions_list, n_regions=76 ) stimulus = create_stimulus( @@ -227,6 +274,7 @@ def simulate_task_BOLD_TS( sim_length, BOLD_period, TAVG_period, + num_stimulated_regions=5, global_conn_coupling_coef=0.0126, D=0.001, conn_speed=1.0, @@ -244,6 +292,7 @@ def simulate_task_BOLD_TS( sim_length=sim_length, BOLD_period=BOLD_period, TAVG_period=TAVG_period, + num_stimulated_regions=num_stimulated_regions, global_conn_coupling_coef=global_conn_coupling_coef, D=D, conn_speed=conn_speed, @@ 
-298,6 +347,8 @@ def simulate_task_data(subj_id, task_info): The BOLD period. - TAVG_period: float The TAVG period. + - num_stimulated_regions: int + The number of stimulated regions. - global_conn_coupling_coef: float The global connectivity coupling coefficient. - D: float @@ -316,6 +367,7 @@ def simulate_task_data(subj_id, task_info): sim_length=task_info["sim_length"], BOLD_period=task_info["BOLD_period"], TAVG_period=task_info["TAVG_period"], + num_stimulated_regions=task_info["num_stimulated_regions"], global_conn_coupling_coef=task_info["global_conn_coupling_coef"], D=task_info["D"], conn_speed=task_info["conn_speed"], diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py index 24aa92a..912bfe1 100644 --- a/simul_dFC/task_data_simulator.py +++ b/simul_dFC/task_data_simulator.py @@ -53,6 +53,7 @@ "sim_length": sim_length, "BOLD_period": BOLD_period, "TAVG_period": TAVG_period, + "num_stimulated_regions": 5, "global_conn_coupling_coef": global_conn_coupling_coef, "D": D, "conn_speed": conn_speed, @@ -66,6 +67,21 @@ "sim_length": sim_length, "BOLD_period": BOLD_period, "TAVG_period": TAVG_period, + "num_stimulated_regions": 5, + "global_conn_coupling_coef": global_conn_coupling_coef, + "D": D, + "conn_speed": conn_speed, + "dt": dt, + }, + "task-lowFreqShortTask": { + "task_name": "task-lowFreqShortTask", + "onset_time": onset_time, + "task_duration": 1.0, + "task_block_duration": 20.0, + "sim_length": sim_length, + "BOLD_period": BOLD_period, + "TAVG_period": TAVG_period, + "num_stimulated_regions": 5, "global_conn_coupling_coef": global_conn_coupling_coef, "D": D, "conn_speed": conn_speed, @@ -79,6 +95,7 @@ "sim_length": sim_length, "BOLD_period": BOLD_period, "TAVG_period": TAVG_period, + "num_stimulated_regions": 5, "global_conn_coupling_coef": global_conn_coupling_coef, "D": D, "conn_speed": conn_speed, @@ -92,6 +109,21 @@ "sim_length": sim_length, "BOLD_period": BOLD_period, "TAVG_period": TAVG_period, + 
"num_stimulated_regions": 5, + "global_conn_coupling_coef": global_conn_coupling_coef, + "D": D, + "conn_speed": conn_speed, + "dt": dt, + }, + "task-lowFreqShortRestDominStimul": { + "task_name": "task-lowFreqShortRestDominStimul", + "onset_time": onset_time, + "task_duration": 12.0, + "task_block_duration": 20.0, + "sim_length": sim_length, + "BOLD_period": BOLD_period, + "TAVG_period": TAVG_period, + "num_stimulated_regions": 15, "global_conn_coupling_coef": global_conn_coupling_coef, "D": D, "conn_speed": conn_speed, @@ -105,6 +137,7 @@ "sim_length": sim_length, "BOLD_period": BOLD_period, "TAVG_period": TAVG_period, + "num_stimulated_regions": 5, "global_conn_coupling_coef": global_conn_coupling_coef, "D": D * 100, "conn_speed": conn_speed, From fa6f28390a1d9ed5893fa7444e310713cd2aa94b Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 8 Jul 2024 20:04:40 -0400 Subject: [PATCH 074/401] minor fix in report --- task_dFC/generate_report.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 1b0bbb5..3db79d1 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -414,7 +414,17 @@ def plot_dFC_matrices( end_TR = int(end_time / TR_mri) start_TR_idx = np.where(np.array(TRs) >= start_TR)[0][0] end_TR_idx = np.where(np.array(TRs) <= end_TR)[0][-1] - chosen_TRs = TRs[start_TR_idx:end_TR_idx] + # if the TR_mri is low which will cause the figure to be too wide, + # we will only plot a resampled version of the dFC matrices, e.g. 
to make it the same as TR_mri=2s + if TR_mri < 2: + TR_step = int(2 / TR_mri) + chosen_TRs = TRs[start_TR_idx:end_TR_idx:TR_step] + # raise warning if the TR_mri is low + print( + f"TR_mri is low ({TR_mri}s), the dFC matrices will be resampled to make the figure width reasonable" + ) + else: + chosen_TRs = TRs[start_TR_idx:end_TR_idx] output_dir = f"{output_root}/subject_results/{subj}/dFC_matrices" if session is not None: From 2278e9b843fb3cded9ab50df315f713dd9a4feaa Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 9 Jul 2024 13:24:55 -0400 Subject: [PATCH 075/401] add paradigm clustering --- task_dFC/ML.py | 139 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 132 insertions(+), 7 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 080efb1..e694133 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -396,11 +396,11 @@ def KNN_classify(X_train, y_train, X_test, y_test, explained_var_threshold=0.95) np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] + 1 ) - + num_PCs = min(num_PCs, 100) # create a pipeline with a knn model to find the best n_neighbors knn = make_pipeline( StandardScaler(), - PCA(n_components=num_PCs), + PCA(n_components=num_PCs, svd_solver="full", whiten=False), KNeighborsClassifier(), ) # create a dictionary of all values we want to test for n_neighbors @@ -414,7 +414,7 @@ def KNN_classify(X_train, y_train, X_test, y_test, explained_var_threshold=0.95) neigh = make_pipeline( StandardScaler(), - PCA(n_components=num_PCs), + PCA(n_components=num_PCs, svd_solver="full", whiten=False), KNeighborsClassifier(n_neighbors=n_neighbors), ).fit(X_train, y_train) @@ -448,7 +448,7 @@ def random_forest_classify( # create a pipeline with a random forest model to find the best n_estimators rf = make_pipeline( StandardScaler(), - PCA(n_components=num_PCs), + PCA(n_components=num_PCs, svd_solver="full", whiten=False), RandomForestClassifier(), ) # create a dictionary of all values we want to test for 
n_estimators @@ -466,7 +466,7 @@ def random_forest_classify( rf = make_pipeline( StandardScaler(), - PCA(n_components=num_PCs), + PCA(n_components=num_PCs, svd_solver="full", whiten=False), RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth), ).fit(X_train, y_train) @@ -651,13 +651,14 @@ def task_presence_clustering( X_normalized = scaler.fit_transform(X) # PCA # find number of components that explain 95% of variance - pca = PCA() + pca = PCA(svd_solver="full", whiten=False) pca.fit(X_normalized) n_components = ( np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] + 1 ) - pca = PCA(n_components=n_components) + n_components = min(n_components, 100) + pca = PCA(n_components=n_components, svd_solver="full", whiten=False) X_pca = pca.fit_transform(X_normalized) kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5) labels_pred = kmeans.fit_predict(X_pca) @@ -815,6 +816,113 @@ def run_clustering( np.save(f"{folder}/clustering_scores_{dFC_id}.npy", clustering_scores) +def task_paradigm_clustering( + dFC_id, + TASKS, + RUNS, + SESSIONS, + roi_root, + dFC_root, + output_root, + normalize_dFC=True, + explained_var_threshold=0.95, +): + for session in SESSIONS: + # find SUBJECTS common to all tasks + for task_id, task in enumerate(TASKS): + if task_id == 0: + SUBJECTS = find_available_subjects( + dFC_root=dFC_root, task=task, dFC_id=dFC_id + ) + else: + SUBJECTS = np.intersect1d( + SUBJECTS, + find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id), + ) + print(f"Number of subjects: {len(SUBJECTS)}") + + X = None + y = None + for task_id, task in enumerate(TASKS): + for run in RUNS[task]: + X_new, _, _, _, _, _, measure_name = dFC_feature_extraction( + task=task, + train_subjects=SUBJECTS, + test_subjects=[], + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + run=run, + session=session, + dynamic_pred="no", + normalize_dFC=normalize_dFC, + ) + y_new = np.ones(X_new.shape[0]) * task_id + if X 
is None and y is None: + X = X_new + y = y_new + else: + X = np.concatenate((X, X_new), axis=0) + y = np.concatenate((y, y_new), axis=0) + + assert X.shape[0] == y.shape[0], "Number of samples do not match." + + # clustering + # apply kmeans clustering with PCA to dFC features + + n_clusters = len(TASKS) # corresponding to task paradigms + + scaler = StandardScaler() + X_normalized = scaler.fit_transform(X) + # PCA + # find number of components that explain 95% of variance + pca = PCA(svd_solver="full", whiten=False) + pca.fit(X_normalized) + n_components = ( + np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[ + 0 + ][0] + + 1 + ) + n_components = min(n_components, 100) + pca = PCA(n_components=n_components, svd_solver="full", whiten=False) + X_pca = pca.fit_transform(X_normalized) + kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5) + labels_pred = kmeans.fit_predict(X_pca) + + # ARI score + print(f"ARI score: {adjusted_rand_score(y, labels_pred)}") + + # visualize clustering centroids + centroids = kmeans.cluster_centers_ + centroids = pca.inverse_transform(centroids) + centroids = scaler.inverse_transform(centroids) + n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) + centroids_mat = dFC_vec2mat(centroids, n_regions) + + task_paradigm_clstr_RESULTS = { + "StandardScaler": scaler, + "num_PCs": n_components, + "PCA": pca, + "kmeans": kmeans, + "ARI": adjusted_rand_score(y, labels_pred), + "centroids": centroids_mat, + "task_paradigms": TASKS, + } + + if session is None: + folder = f"{output_root}" + else: + folder = f"{output_root}/{session}" + if not os.path.exists(folder): + os.makedirs(folder) + + np.save( + f"{folder}/task_paradigm_clstr_RESULTS_{dFC_id}.npy", + task_paradigm_clstr_RESULTS, + ) + + ####################################################################################### if __name__ == "__main__": @@ -918,6 +1026,23 @@ def run_clustering( print(f"Error in clustering for dFC ID {dFC_id}: 
{e}") print(f"Task presence clustering finished for dFC ID {dFC_id}.") + + print(f"Task paradigm clustering started for dFC ID {dFC_id} ...") + try: + task_paradigm_clustering( + dFC_id=dFC_id, + TASKS=TASKS, + RUNS=RUNS, + SESSIONS=SESSIONS, + roi_root=roi_root, + dFC_root=dFC_root, + output_root=ML_root, + normalize_dFC=True, + ) + except Exception as e: + print(f"Error in task paradigm clustering for dFC ID {dFC_id}: {e}") + + print(f"Task paradigm clustering finished for dFC ID {dFC_id}.") print(f"Task presence prediction finished for dFC ID {dFC_id}.") ####################################################################################### From a24931851bfe3806bb158ffa64592b388b3afa2a Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 10 Jul 2024 11:06:47 -0400 Subject: [PATCH 076/401] minor fix --- task_dFC/ML.py | 1 + 1 file changed, 1 insertion(+) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index e694133..eb853a1 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -570,6 +570,7 @@ def task_presence_classification( "dFC method": list(), "Logistic regression accuracy": list(), # "KNN accuracy": list(), + "Random Forest accuracy": list(), } log_reg = log_reg_RESULT["log_reg_model"] # KNN = KNN_RESULT["KNN_model"] From c15f03416af6a7c0d0bd45b775985ea1940745a3 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 11 Jul 2024 10:48:00 -0400 Subject: [PATCH 077/401] change stimulated regions in simul --- pydfc/simul_utils.py | 29 +++++++++-------------------- simul_dFC/task_data_simulator.py | 2 +- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py index fa4631e..79a9cd6 100644 --- a/pydfc/simul_utils.py +++ b/pydfc/simul_utils.py @@ -88,8 +88,11 @@ def simulate_task_BOLD( The number of stimulated regions. The default is 5. 
if num_stimulated_regions is 5, the stimulated regions are: [0, 7, 13, 33, 42] - if num_stimulated_regions is 15, the stimulated regions are: - [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70] + if num_stimulated_regions is 16, the stimulated regions are: + regions = list(range(0, 76, 5)) + if num_stimulated_regions is 26, the stimulated regions are: + regions = list(range(0, 76, 3)) + else, the stimulated regions are randomly selected. """ # randomize some parameters for each subjects onset = np.random.normal(loc=onset_time, scale=0.5) # seconds @@ -102,24 +105,10 @@ def simulate_task_BOLD( # configure stimulus spatial pattern if num_stimulated_regions == 5: stimulated_regions_list = [0, 7, 13, 33, 42] - elif num_stimulated_regions == 15: - stimulated_regions_list = [ - 0, - 5, - 10, - 15, - 20, - 25, - 30, - 35, - 40, - 45, - 50, - 55, - 60, - 65, - 70, - ] + elif num_stimulated_regions == 16: + stimulated_regions_list = list(range(0, 76, 5)) + elif num_stimulated_regions == 26: + stimulated_regions_list = list(range(0, 76, 3)) else: stimulated_regions_list = np.random.choice( np.arange(76), num_stimulated_regions, replace=False diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py index 912bfe1..ba3b6c5 100644 --- a/simul_dFC/task_data_simulator.py +++ b/simul_dFC/task_data_simulator.py @@ -123,7 +123,7 @@ "sim_length": sim_length, "BOLD_period": BOLD_period, "TAVG_period": TAVG_period, - "num_stimulated_regions": 15, + "num_stimulated_regions": 26, "global_conn_coupling_coef": global_conn_coupling_coef, "D": D, "conn_speed": conn_speed, From 6b53cd4ef80a1af11b2f2f4726b607e0a56b2956 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 11 Jul 2024 11:40:15 -0400 Subject: [PATCH 078/401] add RF and GBT to report --- task_dFC/generate_report.py | 68 ++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 3db79d1..df60342 
100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -446,7 +446,7 @@ def plot_dFC_matrices( def plot_ML_results( - ML_root, output_root, task, run=None, session=None, ML_algorithm="KNN" + ML_root, output_root, task, run=None, session=None, ML_algorithm="Random Forest" ): """ Plot the ML results for a given task, run and session. @@ -457,16 +457,27 @@ def plot_ML_results( task: str, task name run: int, run number session: str, session name - ML_algorithm: str, ML algorithm name (default: KNN, other options: Logistic regression) + ML_algorithm: str, ML algorithm name (default: Random Forest, other options: Logistic regression, KNN, Gradient Boosting) """ + # the ML_scores files are saved as ML_scores_classify_{dFC_id}.npy + # find all the ML_scores files in the directory if session is None: - ML_scores = np.load( - f"{ML_root}/ML_scores_classify.npy", allow_pickle="TRUE" - ).item() + input_dir = f"{ML_root}" else: - ML_scores = np.load( - f"{ML_root}/{session}/ML_scores_classify.npy", allow_pickle="TRUE" - ).item() + input_dir = f"{ML_root}/{session}" + ALL_ML_SCORES = os.listdir(input_dir) + ALL_ML_SCORES = [ + score_file for score_file in ALL_ML_SCORES if "ML_scores_classify" in score_file + ] + ALL_ML_SCORES.sort() + ML_scores = None + for score_file in ALL_ML_SCORES: + ML_scores_new = np.load(f"{input_dir}/{score_file}", allow_pickle="TRUE").item() + if ML_scores is None: + ML_scores = ML_scores_new + else: + for key in ML_scores_new.keys(): + ML_scores[key].extend(ML_scores_new[key]) sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0}) @@ -506,6 +517,10 @@ def plot_ML_results( ML_algorithm_name = "LogReg" elif ML_algorithm == "KNN": ML_algorithm_name = "KNN" + elif ML_algorithm == "Random Forest": + ML_algorithm_name = "RF" + elif ML_algorithm == "Gradient Boosting": + ML_algorithm_name = "GBT" if run is None: plt.savefig( @@ -538,14 +553,29 @@ def plot_clustering_results(ML_root, output_root, task, run=None, 
session=None): run: int, run number session: str, session name """ + # the clustering_scores files are saved as clustering_scores_{dFC_id}.npy + # find all the clustering_scores files in the directory if session is None: - clustering_scores = np.load( - f"{ML_root}/clustering_scores.npy", allow_pickle="TRUE" - ).item() + input_dir = f"{ML_root}" else: - clustering_scores = np.load( - f"{ML_root}/{session}/clustering_scores.npy", allow_pickle="TRUE" + input_dir = f"{ML_root}/{session}" + ALL_CLUSTERING_SCORES = os.listdir(input_dir) + ALL_CLUSTERING_SCORES = [ + score_file + for score_file in ALL_CLUSTERING_SCORES + if "clustering_scores" in score_file + ] + ALL_CLUSTERING_SCORES.sort() + clustering_scores = None + for score_file in ALL_CLUSTERING_SCORES: + clustering_scores_new = np.load( + f"{input_dir}/{score_file}", allow_pickle="TRUE" ).item() + if clustering_scores is None: + clustering_scores = clustering_scores_new + else: + for key in clustering_scores_new.keys(): + clustering_scores[key].extend(clustering_scores_new[key]) sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0}) @@ -956,15 +986,15 @@ def create_html_report_group_results( else: classification_dir = f"{group_dir}/classification" - # display KNN classification results - file.write("

KNN

\n") + # display Random Forest classification results + file.write("

Random Forest

\n") if run is None: classification_img = ( - f"{classification_dir}/ML_results_classify_KNN_{task}.png" + f"{classification_dir}/ML_results_classify_RF_{task}.png" ) else: classification_img = ( - f"{classification_dir}/ML_results_classify_KNN_{task}_{run}.png" + f"{classification_dir}/ML_results_classify_RF_{task}_{run}.png" ) img = plt.imread(classification_img) height, width, _ = img.shape @@ -1233,10 +1263,10 @@ def create_html_report_group_results( task=task, run=run, session=session, - ML_algorithm="KNN", + ML_algorithm="Random Forest", ) except Exception as e: - print(f"Error in plotting ML results for KNN: {e}") + print(f"Error in plotting ML results for RF: {e}") try: plot_ML_results( ML_root=ML_root, From d9c8fc4eff1205d237c834a4fd817ac3b1b3e362 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 11 Jul 2024 11:46:22 -0400 Subject: [PATCH 079/401] adjust fig width in report --- task_dFC/generate_report.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index df60342..44a9835 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -158,7 +158,7 @@ def plot_roi_signals( start_TR = int(start_time / TR_mri) end_TR = int(end_time / TR_mri) # keep the figure width proportional to the number of time points - fig_width = int(2.5 * (end_time - start_time) / TR_mri) + fig_width = int(2.5 * (end_time - start_time) / 2) fig_width = min(fig_width, 500) plt.figure(figsize=(fig_width, 5)) for i in nodes_list: @@ -209,13 +209,13 @@ def plot_event_labels( task_data = load_task_data(roi_root, subj, task, run, session) Fs_task = task_data["Fs_task"] TR_task = 1 / Fs_task - TR_mri = task_data["TR_mri"] + # TR_mri = task_data["TR_mri"] time = np.arange(0, task_data["event_labels"].shape[0]) / Fs_task start_timepoint = int(start_time / TR_task) end_timepoint = int(end_time / TR_task) # keep the figure width proportional to the number of time points - fig_width = 
int(2.5 * (end_time - start_time) / TR_mri) + fig_width = int(2.5 * (end_time - start_time) / 2) fig_width = min(fig_width, 500) plt.figure(figsize=(fig_width, 5)) plt.plot( @@ -284,7 +284,7 @@ def plot_task_presence( start_TR = int(start_time / TR_mri) end_TR = int(end_time / TR_mri) # keep the figure width proportional to the number of time points in data - fig_width = int(2.5 * (end_time - start_time) / TR_mri) + fig_width = int(2.5 * (end_time - start_time) / 2) fig_width = min(fig_width, 500) plt.figure(figsize=(fig_width, 5)) plt.plot( @@ -685,7 +685,7 @@ def plot_dFC_clustering( start_TR_idx = np.where(np.array(TR_array) >= start_TR)[0][0] end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1] - fig_width = int(2.5 * (end_time - start_time) / TR_mri) + fig_width = int(2.5 * (end_time - start_time) / 2) fig_width = min(fig_width, 500) plt.figure(figsize=(fig_width, 5)) time = TR_array[start_TR_idx:end_TR_idx] * TR_mri From 6f45fc5b518c557d850fa311e71fea9834ed9feb Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 15 Jul 2024 12:04:20 -0400 Subject: [PATCH 080/401] add paradigm clstr to report --- task_dFC/generate_report.py | 123 ++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 44a9835..9045b59 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -628,6 +628,91 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): plt.close() +def plot_paradigm_clustering( + ML_root, + output_root, + session=None, +): + """ + Plot the clustering results for a given task, run and session. 
+ parameters: + ---------- + ML_root: str, path to ML results + output_root: str, path to save the figures + task: str, task name + run: int, run number + session: str, session name + """ + # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy + # find all the paradigm_clustering_RESULTS files in the directory + if session is None: + input_dir = f"{ML_root}" + else: + input_dir = f"{ML_root}/{session}" + ALL_PARADIGM_CLUSTERING_RESULTS = os.listdir(input_dir) + ALL_PARADIGM_CLUSTERING_RESULTS = [ + result_file + for result_file in ALL_PARADIGM_CLUSTERING_RESULTS + if "task_paradigm_clstr_RESULTS_" in result_file + ] + ALL_PARADIGM_CLUSTERING_RESULTS.sort() + paradigm_clustering_RESULTS = { + "dFC method": [], + "ARI score": [], + } + for result_file in ALL_PARADIGM_CLUSTERING_RESULTS: + paradigm_clustering_RESULTS_new = np.load( + f"{input_dir}/{result_file}", allow_pickle="TRUE" + ).item() + paradigm_clustering_RESULTS["dFC method"].append( + result_file[result_file.find("task_paradigm_clstr_RESULTS_") + 27 : -4] + ) + # paradigm_clustering_RESULTS["dFC method"].append( + # paradigm_clustering_RESULTS_new["dFC_method"] + # ) + paradigm_clustering_RESULTS["ARI score"].append( + paradigm_clustering_RESULTS_new["ARI"] + ) + + sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0}) + + sns.set_style("darkgrid") + + dataframe = pd.DataFrame(paradigm_clustering_RESULTS) + + plt.figure(figsize=(10, 5)) + g = sns.pointplot( + data=dataframe, + x="dFC method", + y="ARI score", + linestyle="none", + dodge=True, + capsize=0.1, + ) + g.axhline(0.0, color="r", linestyle="--") + if show_title: + g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"}) + + # save the figure + if session is None: + output_dir = f"{output_root}/group_results/paradigm_clustering" + else: + output_dir = f"{output_root}/group_results/paradigm_clustering/{session}" + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + 
plt.savefig( + f"{output_dir}/paradigm_clustering_results.{save_fig_format}", + dpi=fig_dpi, + bbox_inches=fig_bbox_inches, + pad_inches=fig_pad, + format=save_fig_format, + ) + + plt.close() + + def plot_dFC_clustering( dFC_root, subj, @@ -1060,6 +1145,34 @@ def create_html_report_group_results( ) file.write("
\n") + + # paradigm clustering results + img_height = 300 + file.write("

Paradigm Clustering Results

\n") + for session in SESSIONS: + if session is not None: + file.write(f"

{session}

\n") + if session is not None: + paradigm_clustering_dir = f"{group_dir}/paradigm_clustering/{session}" + else: + paradigm_clustering_dir = f"{group_dir}/paradigm_clustering" + + # display paradigm clustering results + paradigm_clustering_img = ( + f"{paradigm_clustering_dir}/paradigm_clustering_results.png" + ) + img = plt.imread(paradigm_clustering_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".") + file.write( + f"Paradigm clustering results\n" + ) + + file.write("
\n") + file.write("\n") file.write("\n") file.close() @@ -1229,6 +1342,7 @@ def create_html_report_group_results( except Exception as e: print(f"Error in creating html report for subject results: {e}") + # plot group results # find the common run number for all tasks for task presence features common_run = None for task in TASKS: @@ -1254,6 +1368,15 @@ def create_html_report_group_results( except Exception as e: print(f"Error in plotting task presence features: {e}") + try: + plot_paradigm_clustering( + ML_root=ML_root, + output_root=reports_root, + session=session, + ) + except Exception as e: + print(f"Error in plotting paradigm clustering: {e}") + for task in TASKS: for run in RUNS[task]: try: From cbbb149c3a82a78d7fac002e64b973fa35306023 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 15 Jul 2024 16:34:21 -0400 Subject: [PATCH 081/401] add plot centroids to report --- task_dFC/generate_report.py | 122 +++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 7 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 9045b59..24d8cc7 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -12,7 +12,13 @@ from sklearn.preprocessing import StandardScaler from pydfc import DFC, data_loader, task_utils -from pydfc.dfc_utils import TR_intersection, dFC_mat2vec, dFC_vec2mat, rank_norm +from pydfc.dfc_utils import ( + TR_intersection, + dFC_mat2vec, + dFC_vec2mat, + rank_norm, + visualize_conn_mat_dict, +) ################################# Parameters #################################### @@ -628,7 +634,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): plt.close() -def plot_paradigm_clustering( +def plot_paradigm_clustering_score( ML_root, output_root, session=None, @@ -691,7 +697,10 @@ def plot_paradigm_clustering( ) g.axhline(0.0, color="r", linestyle="--") if show_title: - g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"}) + g.set_title( + "Task 
Paradigm Clustering Performance", + fontdict={"fontsize": 10, "fontweight": "bold"}, + ) # save the figure if session is None: @@ -713,6 +722,65 @@ def plot_paradigm_clustering( plt.close() +def plot_paradigm_clstr_centroids( + ML_root, + output_root, + session=None, +): + """ """ + # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy + # find all the paradigm_clustering_RESULTS files in the directory + if session is None: + input_dir = f"{ML_root}" + else: + input_dir = f"{ML_root}/{session}" + + if session is None: + output_dir = f"{output_root}/group_results/paradigm_clustering_centroids" + else: + output_dir = ( + f"{output_root}/group_results/paradigm_clustering_centroids/{session}" + ) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + ALL_PARADIGM_CLUSTERING_RESULTS = os.listdir(input_dir) + ALL_PARADIGM_CLUSTERING_RESULTS = [ + result_file + for result_file in ALL_PARADIGM_CLUSTERING_RESULTS + if "task_paradigm_clstr_RESULTS_" in result_file + ] + ALL_PARADIGM_CLUSTERING_RESULTS.sort() + + for result_file in ALL_PARADIGM_CLUSTERING_RESULTS: + paradigm_clustering_RESULTS_new = np.load( + f"{input_dir}/{result_file}", allow_pickle="TRUE" + ).item() + + # measure_name = paradigm_clustering_RESULTS_new["dFC_method"] + measure_name = result_file[ + result_file.find("task_paradigm_clstr_RESULTS_") + 28 : -4 + ] + centroids_mats = paradigm_clustering_RESULTS_new["centroids"] + + centroids_dict = {} + for i, centroid_mat in enumerate(centroids_mats): + centroids_dict[f"Cluster {i + 1}"] = centroid_mat + + visualize_conn_mat_dict( + data=centroids_dict, + title=f"Task Paradigm Centroids {measure_name}", + cmap="seismic", + normalize=True, + disp_diag=False, + save_image=True, + output_root=output_dir, + center_0=True, + # node_networks=None, + ) + + def plot_dFC_clustering( dFC_root, subj, @@ -1147,7 +1215,6 @@ def create_html_report_group_results( file.write("
\n") # paradigm clustering results - img_height = 300 file.write("

Paradigm Clustering Results

\n") for session in SESSIONS: if session is not None: @@ -1157,7 +1224,9 @@ def create_html_report_group_results( else: paradigm_clustering_dir = f"{group_dir}/paradigm_clustering" - # display paradigm clustering results + # display paradigm clustering scores + img_height = 300 + file.write("

Paradigm Clustering Scores

\n") paradigm_clustering_img = ( f"{paradigm_clustering_dir}/paradigm_clustering_results.png" ) @@ -1173,6 +1242,36 @@ def create_html_report_group_results( file.write("
\n") + # display paradigm clustering centroids + img_height = 300 + file.write("

Paradigm Clustering Centroids

\n") + # find all png files in the directory + paradigm_clustering_centroids_dir = f"{group_dir}/paradigm_clustering_centroids" + for file_name in os.listdir(paradigm_clustering_centroids_dir): + if file_name.endswith(".png"): + measure_name = file_name[ + file_name.find("Task_Paradigm_Centroids_") + 25 : -4 + ] + file.write(f"

{measure_name}

\n") + paradigm_clustering_centroids_img = ( + f"{paradigm_clustering_centroids_dir}/{file_name}" + ) + # get the original size of the image + img = plt.imread(paradigm_clustering_centroids_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + paradigm_clustering_centroids_img = ( + paradigm_clustering_centroids_img.replace(group_dir, ".") + ) + file.write( + f"Paradigm clustering centroids\n" + ) + file.write("
\n") + + file.write("
\n") + file.write("\n") file.write("\n") file.close() @@ -1369,13 +1468,22 @@ def create_html_report_group_results( print(f"Error in plotting task presence features: {e}") try: - plot_paradigm_clustering( + plot_paradigm_clustering_score( + ML_root=ML_root, + output_root=reports_root, + session=session, + ) + except Exception as e: + print(f"Error in plotting paradigm clustering scores: {e}") + + try: + plot_paradigm_clstr_centroids( ML_root=ML_root, output_root=reports_root, session=session, ) except Exception as e: - print(f"Error in plotting paradigm clustering: {e}") + print(f"Error in plotting paradigm clustering centroids: {e}") for task in TASKS: for run in RUNS[task]: From 25c96028744baeabd374140625a5b1cc2e26cb4f Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 15 Jul 2024 17:27:49 -0400 Subject: [PATCH 082/401] minor fix --- task_dFC/generate_report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 24d8cc7..f1c7a4c 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -775,7 +775,7 @@ def plot_paradigm_clstr_centroids( normalize=True, disp_diag=False, save_image=True, - output_root=output_dir, + output_root=f"{output_dir}/", center_0=True, # node_networks=None, ) From cb0a1b490575d060380f5e85d31647e6adc5dd70 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 15 Jul 2024 17:50:14 -0400 Subject: [PATCH 083/401] switch to GBT in ML --- task_dFC/ML.py | 106 +++++++++++++++++++++++++++++++----- task_dFC/generate_report.py | 2 +- 2 files changed, 94 insertions(+), 14 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index eb853a1..31eee54 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -5,7 +5,7 @@ import numpy as np from sklearn.cluster import KMeans from sklearn.decomposition import PCA -from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier from 
sklearn.linear_model import LogisticRegression from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score from sklearn.model_selection import GridSearchCV, RandomizedSearchCV @@ -482,6 +482,62 @@ def random_forest_classify( return RESULT +def gradient_boosting_classify( + X_train, y_train, X_test, y_test, explained_var_threshold=0.95 +): + """ + Gradient Boosting classification + """ + # find num_PCs + pca = PCA(svd_solver="full", whiten=False) + pca.fit(X_train) + num_PCs = ( + np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] + + 1 + ) + num_PCs = min(num_PCs, 100) + + # create a pipeline with a gradient boosting model to find the best n_estimators + gb = make_pipeline( + StandardScaler(), + PCA(n_components=num_PCs, svd_solver="full", whiten=False), + GradientBoostingClassifier(), + ) + # create a dictionary of all values we want to test for n_estimators + param_grid = { + "gradientboostingclassifier__n_estimators": [10, 50, 100, 200], + "gradientboostingclassifier__learning_rate": [0.01, 0.1, 0.2], + "gradientboostingclassifier__max_depth": [3, 5, 10], + } + # use gridsearch to test all values for n_estimators + gb_gscv = GridSearchCV(gb, param_grid, cv=5) + # fit model to data + gb_gscv.fit(X_train, y_train) + + n_estimators = gb_gscv.best_params_["gradientboostingclassifier__n_estimators"] + learning_rate = gb_gscv.best_params_["gradientboostingclassifier__learning_rate"] + max_depth = gb_gscv.best_params_["gradientboostingclassifier__max_depth"] + + gb = make_pipeline( + StandardScaler(), + PCA(n_components=num_PCs, svd_solver="full", whiten=False), + GradientBoostingClassifier( + n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate + ), + ).fit(X_train, y_train) + + RESULT = { + "GB_pca": pca, + "GB_num_PCs": num_PCs, + "GB_cv_results": gb_gscv.cv_results_, + "GB_model": gb, + "GB_train_score": gb.score(X_train, y_train), + "GB_test_score": gb.score(X_test, y_test), + } + + return RESULT 
+ + def task_presence_classification( task, dFC_id, @@ -495,7 +551,7 @@ def task_presence_classification( explained_var_threshold=0.95, ): """ - perform task presence classification using logistic regression, KNN, or Random Forest + perform task presence classification using logistic regression, KNN, Random Forest, Gradient Boosting for a given task and dFC method and run. """ if run is None: @@ -547,8 +603,13 @@ def task_presence_classification( # X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold # ) - # Random Forest - RF_RESULT = random_forest_classify( + # # Random Forest + # RF_RESULT = random_forest_classify( + # X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold + # ) + + # Gradient Boosting + GBT_RESULT = gradient_boosting_classify( X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold ) @@ -557,8 +618,10 @@ def task_presence_classification( ML_RESULT[key] = log_reg_RESULT[key] # for key in KNN_RESULT: # ML_RESULT[key] = KNN_RESULT[key] - for key in RF_RESULT: - ML_RESULT[key] = RF_RESULT[key] + # for key in RF_RESULT: + # ML_RESULT[key] = RF_RESULT[key] + for key in GBT_RESULT: + ML_RESULT[key] = GBT_RESULT[key] # measure pred score on each subj @@ -570,11 +633,13 @@ def task_presence_classification( "dFC method": list(), "Logistic regression accuracy": list(), # "KNN accuracy": list(), - "Random Forest accuracy": list(), + # "Random Forest accuracy": list(), + "Gradient Boosting accuracy": list(), } log_reg = log_reg_RESULT["log_reg_model"] # KNN = KNN_RESULT["KNN_model"] - RF = RF_RESULT["RF_model"] + # RF = RF_RESULT["RF_model"] + GBT = GBT_RESULT["GB_model"] for subj in SUBJECTS: ML_scores["subj_id"].append(subj) @@ -589,14 +654,18 @@ def task_presence_classification( pred_lr = log_reg.predict(features) # pred_KNN = KNN.predict(features) - pred_RF = RF.predict(features) + # pred_RF = RF.predict(features) + pred_GBT = GBT.predict(features) ML_scores["Logistic 
regression accuracy"].append( balanced_accuracy_score(target, pred_lr) ) # ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN)) - ML_scores["Random Forest accuracy"].append( - balanced_accuracy_score(target, pred_RF) + # ML_scores["Random Forest accuracy"].append( + # balanced_accuracy_score(target, pred_RF) + # ) + ML_scores["Gradient Boosting accuracy"].append( + balanced_accuracy_score(target, pred_GBT) ) ML_scores["task"].append(task) @@ -730,7 +799,8 @@ def run_classification( "dFC method": list(), "Logistic regression accuracy": list(), # "KNN accuracy": list(), - "Random Forest accuracy": list(), + # "Random Forest accuracy": list(), + "Gradient Boosting accuracy": list(), } ML_RESULT = {} @@ -844,9 +914,10 @@ def task_paradigm_clustering( X = None y = None + measure_name = None for task_id, task in enumerate(TASKS): for run in RUNS[task]: - X_new, _, _, _, _, _, measure_name = dFC_feature_extraction( + X_new, _, _, _, _, _, measure_name_new = dFC_feature_extraction( task=task, train_subjects=SUBJECTS, test_subjects=[], @@ -858,6 +929,14 @@ def task_paradigm_clustering( dynamic_pred="no", normalize_dFC=normalize_dFC, ) + + if measure_name is not None: + assert ( + measure_name == measure_name_new + ), "dFC measure is not consistent." 
+ else: + measure_name = measure_name_new + y_new = np.ones(X_new.shape[0]) * task_id if X is None and y is None: X = X_new @@ -902,6 +981,7 @@ def task_paradigm_clustering( centroids_mat = dFC_vec2mat(centroids, n_regions) task_paradigm_clstr_RESULTS = { + "dFC_method": measure_name, "StandardScaler": scaler, "num_PCs": n_components, "PCA": pca, diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index f1c7a4c..5c56b02 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -1250,7 +1250,7 @@ def create_html_report_group_results( for file_name in os.listdir(paradigm_clustering_centroids_dir): if file_name.endswith(".png"): measure_name = file_name[ - file_name.find("Task_Paradigm_Centroids_") + 25 : -4 + file_name.find("Task_Paradigm_Centroids_") + 24 : -4 ] file.write(f"

{measure_name}

\n") paradigm_clustering_centroids_img = ( From 719d7a90306ca4aa533759b4a13789bc547ad1a7 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 15 Jul 2024 18:05:14 -0400 Subject: [PATCH 084/401] add run_simulator --- simul_dFC/run_scripts/run_simulator.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 simul_dFC/run_scripts/run_simulator.sh diff --git a/simul_dFC/run_scripts/run_simulator.sh b/simul_dFC/run_scripts/run_simulator.sh new file mode 100644 index 0000000..e7f6394 --- /dev/null +++ b/simul_dFC/run_scripts/run_simulator.sh @@ -0,0 +1,14 @@ +#!/bin/sh +# +#$ -cwd +#$ -j y +#$ -o logs/simul_out.txt +#$ -e logs/simul_err.txt +#$ -q origami.q +#$ -l h_vmem=8G +#$ -t 1:200 + +source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh +conda activate pydfc +python "/data/origami/dFC/CODEs/pydfc/dFC/simul_dFC/task_data_simulator.py" +conda deactivate From 80bef3a78d3661409603e76bf9a8570bb3074b6a Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 18 Jul 2024 17:27:27 -0400 Subject: [PATCH 085/401] add SI to ML --- task_dFC/ML.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 31eee54..a06e686 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -7,7 +7,7 @@ from sklearn.decomposition import PCA from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier from sklearn.linear_model import LogisticRegression -from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score +from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score, silhouette_score from sklearn.model_selection import GridSearchCV, RandomizedSearchCV from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline @@ -758,6 +758,8 @@ def task_presence_clustering( "run": list(), "dFC method": list(), "Kmeans ARI": list(), + "SI": list(), + "SI_pca": list(), } for subj in SUBJECTS: clustering_scores["subj_id"].append(subj) @@ -770,6 +772,11 @@ 
def task_presence_clustering( clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_kmeans)) + # silhouette score in terms of separability of original labels, not the clustering labels + # using both original features and PCA features + clustering_scores["SI"].append(silhouette_score(features, target)) + clustering_scores["SI_pca"].append(silhouette_score(features_pca, target)) + clustering_scores["task"].append(task) clustering_scores["run"].append(run) clustering_scores["dFC method"].append(measure_name) @@ -854,6 +861,8 @@ def run_clustering( "run": list(), "dFC method": list(), "Kmeans ARI": list(), + "SI": list(), + "SI_pca": list(), } clustering_RESULTS = {} @@ -987,6 +996,8 @@ def task_paradigm_clustering( "PCA": pca, "kmeans": kmeans, "ARI": adjusted_rand_score(y, labels_pred), + "SI": silhouette_score(X, y), + "SI_pca": silhouette_score(X_pca, y), "centroids": centroids_mat, "task_paradigms": TASKS, } From 8cfef3fd7dbaaad6db733dc98eb7e50dd3f9ddde Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 29 Jul 2024 15:24:22 -0400 Subject: [PATCH 086/401] change RF to GBT in report --- task_dFC/generate_report.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 5c56b02..2271ccf 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -1140,14 +1140,14 @@ def create_html_report_group_results( classification_dir = f"{group_dir}/classification" # display Random Forest classification results - file.write("

Random Forest

\n") + file.write("

Gradient Boosting

\n") if run is None: classification_img = ( - f"{classification_dir}/ML_results_classify_RF_{task}.png" + f"{classification_dir}/ML_results_classify_GBT_{task}.png" ) else: classification_img = ( - f"{classification_dir}/ML_results_classify_RF_{task}_{run}.png" + f"{classification_dir}/ML_results_classify_GBT_{task}_{run}.png" ) img = plt.imread(classification_img) height, width, _ = img.shape @@ -1494,10 +1494,10 @@ def create_html_report_group_results( task=task, run=run, session=session, - ML_algorithm="Random Forest", + ML_algorithm="Gradient Boosting", ) except Exception as e: - print(f"Error in plotting ML results for RF: {e}") + print(f"Error in plotting ML results for GBT: {e}") try: plot_ML_results( ML_root=ML_root, From 196b2f13095ce5ee0ad4804502918c3930bb0baf Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 29 Jul 2024 18:28:38 -0400 Subject: [PATCH 087/401] change ML from GBT to KNN --- task_dFC/ML.py | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index a06e686..3779cdf 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -598,30 +598,30 @@ def task_presence_classification( # logistic regression log_reg_RESULT = logistic_regression_classify(X_train, y_train, X_test, y_test) - # # KNN - # KNN_RESULT = KNN_classify( - # X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold - # ) + # KNN + KNN_RESULT = KNN_classify( + X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold + ) # # Random Forest # RF_RESULT = random_forest_classify( # X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold # ) - # Gradient Boosting - GBT_RESULT = gradient_boosting_classify( - X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold - ) + # # Gradient Boosting + # GBT_RESULT = gradient_boosting_classify( + # X_train, y_train, X_test, y_test, 
explained_var_threshold=explained_var_threshold + # ) ML_RESULT = {} for key in log_reg_RESULT: ML_RESULT[key] = log_reg_RESULT[key] - # for key in KNN_RESULT: - # ML_RESULT[key] = KNN_RESULT[key] + for key in KNN_RESULT: + ML_RESULT[key] = KNN_RESULT[key] # for key in RF_RESULT: # ML_RESULT[key] = RF_RESULT[key] - for key in GBT_RESULT: - ML_RESULT[key] = GBT_RESULT[key] + # for key in GBT_RESULT: + # ML_RESULT[key] = GBT_RESULT[key] # measure pred score on each subj @@ -632,14 +632,14 @@ def task_presence_classification( "run": list(), "dFC method": list(), "Logistic regression accuracy": list(), - # "KNN accuracy": list(), + "KNN accuracy": list(), # "Random Forest accuracy": list(), - "Gradient Boosting accuracy": list(), + # "Gradient Boosting accuracy": list(), } log_reg = log_reg_RESULT["log_reg_model"] - # KNN = KNN_RESULT["KNN_model"] + KNN = KNN_RESULT["KNN_model"] # RF = RF_RESULT["RF_model"] - GBT = GBT_RESULT["GB_model"] + # GBT = GBT_RESULT["GB_model"] for subj in SUBJECTS: ML_scores["subj_id"].append(subj) @@ -653,20 +653,20 @@ def task_presence_classification( target = y_test[subj_label_test == subj] pred_lr = log_reg.predict(features) - # pred_KNN = KNN.predict(features) + pred_KNN = KNN.predict(features) # pred_RF = RF.predict(features) - pred_GBT = GBT.predict(features) + # pred_GBT = GBT.predict(features) ML_scores["Logistic regression accuracy"].append( balanced_accuracy_score(target, pred_lr) ) - # ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN)) + ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN)) # ML_scores["Random Forest accuracy"].append( # balanced_accuracy_score(target, pred_RF) # ) - ML_scores["Gradient Boosting accuracy"].append( - balanced_accuracy_score(target, pred_GBT) - ) + # ML_scores["Gradient Boosting accuracy"].append( + # balanced_accuracy_score(target, pred_GBT) + # ) ML_scores["task"].append(task) ML_scores["run"].append(run) @@ -805,9 +805,9 @@ def run_classification( 
"run": list(), "dFC method": list(), "Logistic regression accuracy": list(), - # "KNN accuracy": list(), + "KNN accuracy": list(), # "Random Forest accuracy": list(), - "Gradient Boosting accuracy": list(), + # "Gradient Boosting accuracy": list(), } ML_RESULT = {} From 6d0f1e8f07335ce71b38ae3c18efb617d0204f2b Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 29 Jul 2024 18:28:54 -0400 Subject: [PATCH 088/401] minor change --- task_dFC/generate_report.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 2271ccf..2002bf6 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -671,11 +671,8 @@ def plot_paradigm_clustering_score( f"{input_dir}/{result_file}", allow_pickle="TRUE" ).item() paradigm_clustering_RESULTS["dFC method"].append( - result_file[result_file.find("task_paradigm_clstr_RESULTS_") + 27 : -4] + paradigm_clustering_RESULTS_new["dFC_method"] ) - # paradigm_clustering_RESULTS["dFC method"].append( - # paradigm_clustering_RESULTS_new["dFC_method"] - # ) paradigm_clustering_RESULTS["ARI score"].append( paradigm_clustering_RESULTS_new["ARI"] ) @@ -758,10 +755,7 @@ def plot_paradigm_clstr_centroids( f"{input_dir}/{result_file}", allow_pickle="TRUE" ).item() - # measure_name = paradigm_clustering_RESULTS_new["dFC_method"] - measure_name = result_file[ - result_file.find("task_paradigm_clstr_RESULTS_") + 28 : -4 - ] + measure_name = paradigm_clustering_RESULTS_new["dFC_method"] centroids_mats = paradigm_clustering_RESULTS_new["centroids"] centroids_dict = {} From 6aa93598a6b0aeb3b29920130baeb79fe769e77f Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 30 Jul 2024 14:20:15 -0400 Subject: [PATCH 089/401] add manifold learning --- task_dFC/ML.py | 309 ++++++++++++++++++++++++++++++------------------- 1 file changed, 191 insertions(+), 118 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 3779cdf..1db529a 100644 --- a/task_dFC/ML.py 
+++ b/task_dFC/ML.py @@ -3,10 +3,12 @@ import os import numpy as np +from scipy.spatial import procrustes from sklearn.cluster import KMeans from sklearn.decomposition import PCA from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier from sklearn.linear_model import LogisticRegression +from sklearn.manifold import SpectralEmbedding from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score, silhouette_score from sklearn.model_selection import GridSearchCV, RandomizedSearchCV from sklearn.neighbors import KNeighborsClassifier @@ -245,6 +247,101 @@ def load_task_data(roi_root, subj, task, run=None, session=None): return task_data +def embed_dFC_features( + train_subjects, + test_subjects, + X_train, + X_test, + y_train, + y_test, + subj_label_train, + subj_label_test, + embedding="PCA", + n_components=30, + n_neighbors_LE=90, +): + """ + Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous. + + for LE, first the LE is applied on each subj separately and then the procrustes transformation is applied to align the embeddings of different subjects. + All the subjects are transformed into the space of the subject with the highest silhouette score. 
+ """ + if embedding == "PCA": + pca = PCA(n_components=n_components, svd_solver="full", whiten=False) + pca.fit(X_train) + X_train_embed = pca.transform(X_train) + X_test_embed = pca.transform(X_test) + elif embedding == "LE": + # first embed the dFC features of each subject into a lower dimensional space using LE separately + embed_dict = {} + for subject in train_subjects: + # assert the samples of the same subject are contiguous + assert np.all( + np.diff(np.where(subj_label_train == subject)[0]) == 1 + ), f"Indices of {subject} are not consecutive" + X_subj = X_train[subj_label_train == subject, :] + y_subj = y_train[subj_label_train == subject] + LE = SpectralEmbedding( + n_components=n_components, + n_neighbors=n_neighbors_LE, + ) + X_subj_embed = LE.fit_transform(X_subj) + SI = silhouette_score(X_subj_embed, y_subj) + embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI} + + # find the best transformation based on the SI score + best_SI = -1 + best_subject = None + for subject in embed_dict: + if embed_dict[subject]["SI"] > best_SI: + best_SI = embed_dict[subject]["SI"] + best_subject = subject + + # apply procrustes transformation to align the embeddings of different subjects + # use the embeddings of the subject with the highest SI score as the reference + X_train_embed = None + for subject in train_subjects: + X_subj_embed = embed_dict[subject]["X_subj_embed"] + # procrustes transformation + if subject == best_subject: + X_subj_embed_transformed = X_subj_embed + else: + _, X_subj_embed_transformed, _ = procrustes( + embed_dict[best_subject]["X_subj_embed"], X_subj_embed + ) + if X_train_embed is None: + X_train_embed = X_subj_embed_transformed + else: + X_train_embed = np.concatenate( + (X_train_embed, X_subj_embed_transformed), axis=0 + ) + + # apply the same transformation to the test set + X_test_embed = None + for subject in test_subjects: + # assert the samples of the same subject are contiguous + assert np.all( + 
np.diff(np.where(subj_label_test == subject)[0]) == 1 + ), f"Indices of {subject} are not consecutive" + X_subj = X_test[subj_label_test == subject, :] + LE = SpectralEmbedding( + n_components=n_components, + n_neighbors=n_neighbors_LE, + ) + X_subj_embed = LE.fit_transform(X_subj) + _, X_subj_embed_transformed, _ = procrustes( + embed_dict[best_subject]["X_subj_embed"], X_subj_embed + ) + if X_test_embed is None: + X_test_embed = X_subj_embed_transformed + else: + X_test_embed = np.concatenate( + (X_test_embed, X_subj_embed_transformed), axis=0 + ) + + return X_train_embed, X_test_embed + + def dFC_feature_extraction( task, train_subjects, @@ -385,22 +482,13 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test): return RESULT -def KNN_classify(X_train, y_train, X_test, y_test, explained_var_threshold=0.95): +def KNN_classify(X_train, y_train, X_test, y_test): """ KNN classification """ - # find num_PCs - pca = PCA(svd_solver="full", whiten=False) - pca.fit(X_train) - num_PCs = ( - np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] - + 1 - ) - num_PCs = min(num_PCs, 100) # create a pipeline with a knn model to find the best n_neighbors knn = make_pipeline( StandardScaler(), - PCA(n_components=num_PCs, svd_solver="full", whiten=False), KNeighborsClassifier(), ) # create a dictionary of all values we want to test for n_neighbors @@ -414,13 +502,10 @@ def KNN_classify(X_train, y_train, X_test, y_test, explained_var_threshold=0.95) neigh = make_pipeline( StandardScaler(), - PCA(n_components=num_PCs, svd_solver="full", whiten=False), KNeighborsClassifier(n_neighbors=n_neighbors), ).fit(X_train, y_train) RESULT = { - "KNN_pca": pca, - "KNN_num_PCs": num_PCs, "KNN_cv_results": knn_gscv.cv_results_, "KNN_model": neigh, "KNN_train_score": neigh.score(X_train, y_train), @@ -430,25 +515,13 @@ def KNN_classify(X_train, y_train, X_test, y_test, explained_var_threshold=0.95) return RESULT -def random_forest_classify( - X_train, 
y_train, X_test, y_test, explained_var_threshold=0.95 -): +def random_forest_classify(X_train, y_train, X_test, y_test): """ Random Forest classification """ - # find num_PCs - pca = PCA(svd_solver="full", whiten=False) - pca.fit(X_train) - num_PCs = ( - np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] - + 1 - ) - num_PCs = min(num_PCs, 100) - # create a pipeline with a random forest model to find the best n_estimators rf = make_pipeline( StandardScaler(), - PCA(n_components=num_PCs, svd_solver="full", whiten=False), RandomForestClassifier(), ) # create a dictionary of all values we want to test for n_estimators @@ -466,13 +539,10 @@ def random_forest_classify( rf = make_pipeline( StandardScaler(), - PCA(n_components=num_PCs, svd_solver="full", whiten=False), RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth), ).fit(X_train, y_train) RESULT = { - "RF_pca": pca, - "RF_num_PCs": num_PCs, "RF_cv_results": rf_gscv.cv_results_, "RF_model": rf, "RF_train_score": rf.score(X_train, y_train), @@ -482,25 +552,13 @@ def random_forest_classify( return RESULT -def gradient_boosting_classify( - X_train, y_train, X_test, y_test, explained_var_threshold=0.95 -): +def gradient_boosting_classify(X_train, y_train, X_test, y_test): """ Gradient Boosting classification """ - # find num_PCs - pca = PCA(svd_solver="full", whiten=False) - pca.fit(X_train) - num_PCs = ( - np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] - + 1 - ) - num_PCs = min(num_PCs, 100) - # create a pipeline with a gradient boosting model to find the best n_estimators gb = make_pipeline( StandardScaler(), - PCA(n_components=num_PCs, svd_solver="full", whiten=False), GradientBoostingClassifier(), ) # create a dictionary of all values we want to test for n_estimators @@ -520,15 +578,12 @@ def gradient_boosting_classify( gb = make_pipeline( StandardScaler(), - PCA(n_components=num_PCs, svd_solver="full", whiten=False), 
GradientBoostingClassifier( n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate ), ).fit(X_train, y_train) RESULT = { - "GB_pca": pca, - "GB_num_PCs": num_PCs, "GB_cv_results": gb_gscv.cv_results_, "GB_model": gb, "GB_train_score": gb.score(X_train, y_train), @@ -548,7 +603,6 @@ def task_presence_classification( dynamic_pred="no", normalize_dFC=True, train_test_ratio=0.8, - explained_var_threshold=0.95, ): """ perform task presence classification using logistic regression, KNN, Random Forest, Gradient Boosting @@ -591,6 +645,21 @@ def task_presence_classification( ) ) + # embed dFC features + X_train, X_test = embed_dFC_features( + train_subjects=train_subjects, + test_subjects=test_subjects, + X_train=X_train, + X_test=X_test, + y_train=y_train, + y_test=y_test, + subj_label_train=subj_label_train, + subj_label_test=subj_label_test, + embedding="LE", + n_components=30, + n_neighbors_LE=90, + ) + # task presence classification print("task presence classification ...") @@ -599,18 +668,16 @@ def task_presence_classification( log_reg_RESULT = logistic_regression_classify(X_train, y_train, X_test, y_test) # KNN - KNN_RESULT = KNN_classify( - X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold - ) + KNN_RESULT = KNN_classify(X_train, y_train, X_test, y_test) # # Random Forest # RF_RESULT = random_forest_classify( - # X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold + # X_train, y_train, X_test, y_test # ) # # Gradient Boosting # GBT_RESULT = gradient_boosting_classify( - # X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold + # X_train, y_train, X_test, y_test # ) ML_RESULT = {} @@ -683,7 +750,6 @@ def task_presence_clustering( run=None, session=None, normalize_dFC=True, - explained_var_threshold=0.95, ): if run is None: print(f"=============== {task} ===============") @@ -712,44 +778,46 @@ def task_presence_clustering( normalize_dFC=normalize_dFC, ) + # 
embed dFC features + X, _ = embed_dFC_features( + train_subjects=SUBJECTS, + test_subjects=[], + X_train=X, + X_test=None, + y_train=y, + y_test=None, + subj_label_train=subj_label, + subj_label_test=None, + embedding="LE", + n_components=30, + n_neighbors_LE=90, + ) + # clustering - # apply kmeans clustering with PCA to dFC features + # apply kmeans clustering to dFC features n_clusters = 2 # corresponding to task and rest scaler = StandardScaler() X_normalized = scaler.fit_transform(X) - # PCA - # find number of components that explain 95% of variance - pca = PCA(svd_solver="full", whiten=False) - pca.fit(X_normalized) - n_components = ( - np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] - + 1 - ) - n_components = min(n_components, 100) - pca = PCA(n_components=n_components, svd_solver="full", whiten=False) - X_pca = pca.fit_transform(X_normalized) kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5) - labels_pred = kmeans.fit_predict(X_pca) + labels_pred = kmeans.fit_predict(X_normalized) # ARI score print(f"ARI score: {adjusted_rand_score(y, labels_pred)}") - # visualize clustering centroids - centroids = kmeans.cluster_centers_ - centroids = pca.inverse_transform(centroids) - centroids = scaler.inverse_transform(centroids) - n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) - centroids_mat = dFC_vec2mat(centroids, n_regions) + # # visualize clustering centroids + # centroids = kmeans.cluster_centers_ + # centroids = pca.inverse_transform(centroids) + # centroids = scaler.inverse_transform(centroids) + # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) + # centroids_mat = dFC_vec2mat(centroids, n_regions) clustering_RESULTS = { "StandardScaler": scaler, - "num_PCs": n_components, - "PCA": pca, "kmeans": kmeans, "ARI": adjusted_rand_score(y, labels_pred), - "centroids": centroids_mat, + # "centroids": centroids_mat, } clustering_scores = { @@ -759,7 +827,6 @@ def task_presence_clustering( 
"dFC method": list(), "Kmeans ARI": list(), "SI": list(), - "SI_pca": list(), } for subj in SUBJECTS: clustering_scores["subj_id"].append(subj) @@ -767,15 +834,12 @@ def task_presence_clustering( target = y[subj_label == subj] features_normalized = scaler.transform(features) - features_pca = pca.transform(features_normalized) - pred_kmeans = kmeans.predict(features_pca) + pred_kmeans = kmeans.predict(features_normalized) clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_kmeans)) # silhouette score in terms of separability of original labels, not the clustering labels - # using both original features and PCA features clustering_scores["SI"].append(silhouette_score(features, target)) - clustering_scores["SI_pca"].append(silhouette_score(features_pca, target)) clustering_scores["task"].append(task) clustering_scores["run"].append(run) @@ -862,7 +926,6 @@ def run_clustering( "dFC method": list(), "Kmeans ARI": list(), "SI": list(), - "SI_pca": list(), } clustering_RESULTS = {} @@ -905,7 +968,6 @@ def task_paradigm_clustering( dFC_root, output_root, normalize_dFC=True, - explained_var_threshold=0.95, ): for session in SESSIONS: # find SUBJECTS common to all tasks @@ -923,20 +985,23 @@ def task_paradigm_clustering( X = None y = None + subj_label = None measure_name = None for task_id, task in enumerate(TASKS): for run in RUNS[task]: - X_new, _, _, _, _, _, measure_name_new = dFC_feature_extraction( - task=task, - train_subjects=SUBJECTS, - test_subjects=[], - dFC_id=dFC_id, - roi_root=roi_root, - dFC_root=dFC_root, - run=run, - session=session, - dynamic_pred="no", - normalize_dFC=normalize_dFC, + X_new, _, _, _, subj_label_new, _, measure_name_new = ( + dFC_feature_extraction( + task=task, + train_subjects=SUBJECTS, + test_subjects=[], + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + run=run, + session=session, + dynamic_pred="no", + normalize_dFC=normalize_dFC, + ) ) if measure_name is not None: @@ -950,55 +1015,63 @@ def 
task_paradigm_clustering( if X is None and y is None: X = X_new y = y_new + subj_label = subj_label_new else: X = np.concatenate((X, X_new), axis=0) y = np.concatenate((y, y_new), axis=0) + subj_label = np.concatenate((subj_label, subj_label_new), axis=0) assert X.shape[0] == y.shape[0], "Number of samples do not match." + assert X.shape[0] == subj_label.shape[0], "Number of samples do not match." + + # rearrange the order of the samples so that the samples of the same subject are together + idx = np.argsort(subj_label) + X = X[idx, :] + y = y[idx] + subj_label = subj_label[idx] + + # embed dFC features + X, _ = embed_dFC_features( + train_subjects=SUBJECTS, + test_subjects=[], + X_train=X, + X_test=None, + y_train=y, + y_test=None, + subj_label_train=subj_label, + subj_label_test=None, + embedding="LE", + n_components=30, + n_neighbors_LE=90, + ) # clustering - # apply kmeans clustering with PCA to dFC features + # apply kmeans clustering to dFC features n_clusters = len(TASKS) # corresponding to task paradigms scaler = StandardScaler() X_normalized = scaler.fit_transform(X) - # PCA - # find number of components that explain 95% of variance - pca = PCA(svd_solver="full", whiten=False) - pca.fit(X_normalized) - n_components = ( - np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[ - 0 - ][0] - + 1 - ) - n_components = min(n_components, 100) - pca = PCA(n_components=n_components, svd_solver="full", whiten=False) - X_pca = pca.fit_transform(X_normalized) kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5) - labels_pred = kmeans.fit_predict(X_pca) + labels_pred = kmeans.fit_predict(X_normalized) # ARI score print(f"ARI score: {adjusted_rand_score(y, labels_pred)}") - # visualize clustering centroids - centroids = kmeans.cluster_centers_ - centroids = pca.inverse_transform(centroids) - centroids = scaler.inverse_transform(centroids) - n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) - centroids_mat = 
dFC_vec2mat(centroids, n_regions) + # # visualize clustering centroids + # centroids = kmeans.cluster_centers_ + # centroids = pca.inverse_transform(centroids) + # centroids = scaler.inverse_transform(centroids) + # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) + # centroids_mat = dFC_vec2mat(centroids, n_regions) task_paradigm_clstr_RESULTS = { "dFC_method": measure_name, "StandardScaler": scaler, - "num_PCs": n_components, - "PCA": pca, "kmeans": kmeans, "ARI": adjusted_rand_score(y, labels_pred), - "SI": silhouette_score(X, y), - "SI_pca": silhouette_score(X_pca, y), - "centroids": centroids_mat, + "SI": silhouette_score(X_normalized, y), + # "centroids": centroids_mat, "task_paradigms": TASKS, } From bad1003f7bdaa0122faeaa958d12755221140794 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 30 Jul 2024 16:11:58 -0400 Subject: [PATCH 090/401] minor change --- task_dFC/ML.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 1db529a..a8d000e 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -1,6 +1,7 @@ import argparse import json import os +import traceback import numpy as np from scipy.spatial import procrustes @@ -258,7 +259,7 @@ def embed_dFC_features( subj_label_test, embedding="PCA", n_components=30, - n_neighbors_LE=90, + n_neighbors_LE=100, ): """ Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous. 
@@ -657,7 +658,7 @@ def task_presence_classification( subj_label_test=subj_label_test, embedding="LE", n_components=30, - n_neighbors_LE=90, + n_neighbors_LE=100, ) # task presence classification @@ -790,7 +791,7 @@ def task_presence_clustering( subj_label_test=None, embedding="LE", n_components=30, - n_neighbors_LE=90, + n_neighbors_LE=100, ) # clustering @@ -1042,7 +1043,7 @@ def task_paradigm_clustering( subj_label_test=None, embedding="LE", n_components=30, - n_neighbors_LE=90, + n_neighbors_LE=100, ) # clustering @@ -1174,6 +1175,7 @@ def task_paradigm_clustering( ) except Exception as e: print(f"Error in classification for dFC ID {dFC_id}: {e}") + traceback.print_exc() print(f"Task presence classification finished for dFC ID {dFC_id}.") print(f"Task presence clustering started for dFC ID {dFC_id} ...") try: @@ -1189,6 +1191,7 @@ def task_paradigm_clustering( ) except Exception as e: print(f"Error in clustering for dFC ID {dFC_id}: {e}") + traceback.print_exc() print(f"Task presence clustering finished for dFC ID {dFC_id}.") @@ -1206,6 +1209,7 @@ def task_paradigm_clustering( ) except Exception as e: print(f"Error in task paradigm clustering for dFC ID {dFC_id}: {e}") + traceback.print_exc() print(f"Task paradigm clustering finished for dFC ID {dFC_id}.") print(f"Task presence prediction finished for dFC ID {dFC_id}.") From 868cbea000418acaaa329c621c6934c2fbd820a4 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 30 Jul 2024 16:51:01 -0400 Subject: [PATCH 091/401] change in LE --- task_dFC/ML.py | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index a8d000e..9a6c0f6 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -259,7 +259,7 @@ def embed_dFC_features( subj_label_test, embedding="PCA", n_components=30, - n_neighbors_LE=100, + n_neighbors_LE=110, ): """ Embed the dFC features into a lower dimensional space using PCA or LE. 
For LE, it assumes that the samples of the same subject are contiguous. @@ -307,8 +307,37 @@ def embed_dFC_features( if subject == best_subject: X_subj_embed_transformed = X_subj_embed else: + # for the procrustes transformation, the number of samples should be the same + if ( + X_subj_embed.shape[0] + > embed_dict[best_subject]["X_subj_embed"].shape[0] + ): + # add zero rows to the embedding of the best subject + X_best_subj_embed = np.concatenate( + ( + embed_dict[best_subject]["X_subj_embed"], + np.zeros( + ( + X_subj_embed.shape[0] + - embed_dict[best_subject]["X_subj_embed"].shape[0], + n_components, + ) + ), + ), + axis=0, + ) + elif ( + X_subj_embed.shape[0] + < embed_dict[best_subject]["X_subj_embed"].shape[0] + ): + # remove extra rows from the embedding of the best subject + X_best_subj_embed = embed_dict[best_subject]["X_subj_embed"][ + : X_subj_embed.shape[0], : + ] + else: + X_best_subj_embed = embed_dict[best_subject]["X_subj_embed"] _, X_subj_embed_transformed, _ = procrustes( - embed_dict[best_subject]["X_subj_embed"], X_subj_embed + X_best_subj_embed, X_subj_embed ) if X_train_embed is None: X_train_embed = X_subj_embed_transformed @@ -658,7 +687,7 @@ def task_presence_classification( subj_label_test=subj_label_test, embedding="LE", n_components=30, - n_neighbors_LE=100, + n_neighbors_LE=110, ) # task presence classification @@ -791,7 +820,7 @@ def task_presence_clustering( subj_label_test=None, embedding="LE", n_components=30, - n_neighbors_LE=100, + n_neighbors_LE=110, ) # clustering @@ -1043,7 +1072,7 @@ def task_paradigm_clustering( subj_label_test=None, embedding="LE", n_components=30, - n_neighbors_LE=100, + n_neighbors_LE=110, ) # clustering From 6f2f831f18d6d032aa3f64a6483f4c5f8459571b Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 30 Jul 2024 19:57:27 -0400 Subject: [PATCH 092/401] minor change --- task_dFC/ML.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 
9a6c0f6..eace7ea 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -259,7 +259,7 @@ def embed_dFC_features( subj_label_test, embedding="PCA", n_components=30, - n_neighbors_LE=110, + n_neighbors_LE=150, ): """ Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous. @@ -687,7 +687,7 @@ def task_presence_classification( subj_label_test=subj_label_test, embedding="LE", n_components=30, - n_neighbors_LE=110, + n_neighbors_LE=150, ) # task presence classification @@ -820,7 +820,7 @@ def task_presence_clustering( subj_label_test=None, embedding="LE", n_components=30, - n_neighbors_LE=110, + n_neighbors_LE=150, ) # clustering @@ -1072,7 +1072,7 @@ def task_paradigm_clustering( subj_label_test=None, embedding="LE", n_components=30, - n_neighbors_LE=110, + n_neighbors_LE=150, ) # clustering From a6a630e60c0cb5dfcd00d2bf3eb82c9120b2e55a Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 31 Jul 2024 15:13:08 -0400 Subject: [PATCH 093/401] change l2 to l1 in logreg --- task_dFC/ML.py | 78 ++++++++++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index eace7ea..dad5296 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -248,6 +248,36 @@ def load_task_data(roi_root, subj, task, run=None, session=None): return task_data +def precheck_for_procruste(X_best, X_subj): + """ + Check if the two matrices have the same number of rows. if not, make them the same. 
+ """ + # for the procrustes transformation, the number of samples should be the same + if X_subj.shape[0] > X_best.shape[0]: + # add zero rows to the embedding of the best subject + X_best_new = np.concatenate( + ( + X_best, + np.zeros( + ( + X_subj.shape[0] - X_best.shape[0], + X_best.shape[1], + ) + ), + ), + axis=0, + ) + elif X_subj.shape[0] < X_best.shape[0]: + # remove extra rows from the embedding of the best subject + X_best_new = X_best[: X_subj.shape[0], :] + else: + X_best_new = X_best + + X_best_new = X_best_new.copy() + + return X_best_new + + def embed_dFC_features( train_subjects, test_subjects, @@ -259,7 +289,7 @@ def embed_dFC_features( subj_label_test, embedding="PCA", n_components=30, - n_neighbors_LE=150, + n_neighbors_LE=125, ): """ Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous. @@ -308,34 +338,9 @@ def embed_dFC_features( X_subj_embed_transformed = X_subj_embed else: # for the procrustes transformation, the number of samples should be the same - if ( - X_subj_embed.shape[0] - > embed_dict[best_subject]["X_subj_embed"].shape[0] - ): - # add zero rows to the embedding of the best subject - X_best_subj_embed = np.concatenate( - ( - embed_dict[best_subject]["X_subj_embed"], - np.zeros( - ( - X_subj_embed.shape[0] - - embed_dict[best_subject]["X_subj_embed"].shape[0], - n_components, - ) - ), - ), - axis=0, - ) - elif ( - X_subj_embed.shape[0] - < embed_dict[best_subject]["X_subj_embed"].shape[0] - ): - # remove extra rows from the embedding of the best subject - X_best_subj_embed = embed_dict[best_subject]["X_subj_embed"][ - : X_subj_embed.shape[0], : - ] - else: - X_best_subj_embed = embed_dict[best_subject]["X_subj_embed"] + X_best_subj_embed = precheck_for_procruste( + embed_dict[best_subject]["X_subj_embed"], X_subj_embed + ) _, X_subj_embed_transformed, _ = procrustes( X_best_subj_embed, X_subj_embed ) @@ -359,9 +364,12 @@ def embed_dFC_features( 
n_neighbors=n_neighbors_LE, ) X_subj_embed = LE.fit_transform(X_subj) - _, X_subj_embed_transformed, _ = procrustes( + # procrustes transformation + # for the procrustes transformation, the number of samples should be the same + X_best_subj_embed = precheck_for_procruste( embed_dict[best_subject]["X_subj_embed"], X_subj_embed ) + _, X_subj_embed_transformed, _ = procrustes(X_best_subj_embed, X_subj_embed) if X_test_embed is None: X_test_embed = X_subj_embed_transformed else: @@ -487,7 +495,7 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test): Logistic regression classification """ # create a pipeline with a logistic regression model to find the best C - logistic_reg = make_pipeline(StandardScaler(), LogisticRegression()) + logistic_reg = make_pipeline(StandardScaler(), LogisticRegression(penalty="l1")) # create a dictionary of all values we want to test for C param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]} # use gridsearch to test all values for C @@ -499,7 +507,7 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test): log_reg = make_pipeline( StandardScaler(), - LogisticRegression(C=C), + LogisticRegression(penalty="l1", C=C), ).fit(X_train, y_train) RESULT = { @@ -687,7 +695,7 @@ def task_presence_classification( subj_label_test=subj_label_test, embedding="LE", n_components=30, - n_neighbors_LE=150, + n_neighbors_LE=125, ) # task presence classification @@ -820,7 +828,7 @@ def task_presence_clustering( subj_label_test=None, embedding="LE", n_components=30, - n_neighbors_LE=150, + n_neighbors_LE=125, ) # clustering @@ -1072,7 +1080,7 @@ def task_paradigm_clustering( subj_label_test=None, embedding="LE", n_components=30, - n_neighbors_LE=150, + n_neighbors_LE=125, ) # clustering From 039a9e848f3cef84700d84190f3a3b5abe544d6d Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 31 Jul 2024 15:33:59 -0400 Subject: [PATCH 094/401] minor change --- task_dFC/ML.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 
deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index dad5296..41fb0ff 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -495,7 +495,9 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test): Logistic regression classification """ # create a pipeline with a logistic regression model to find the best C - logistic_reg = make_pipeline(StandardScaler(), LogisticRegression(penalty="l1")) + logistic_reg = make_pipeline( + StandardScaler(), LogisticRegression(penalty="l1", solver="saga") + ) # create a dictionary of all values we want to test for C param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]} # use gridsearch to test all values for C @@ -507,7 +509,7 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test): log_reg = make_pipeline( StandardScaler(), - LogisticRegression(penalty="l1", C=C), + LogisticRegression(penalty="l1", C=C, solver="saga"), ).fit(X_train, y_train) RESULT = { From 13a36b995d665efb709cc3281b43f347ef3323e6 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 31 Jul 2024 15:41:36 -0400 Subject: [PATCH 095/401] minor fix --- task_dFC/ML.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 41fb0ff..eb35ccd 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -314,7 +314,7 @@ def embed_dFC_features( y_subj = y_train[subj_label_train == subject] LE = SpectralEmbedding( n_components=n_components, - n_neighbors=n_neighbors_LE, + n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), ) X_subj_embed = LE.fit_transform(X_subj) SI = silhouette_score(X_subj_embed, y_subj) @@ -361,7 +361,7 @@ def embed_dFC_features( X_subj = X_test[subj_label_test == subject, :] LE = SpectralEmbedding( n_components=n_components, - n_neighbors=n_neighbors_LE, + n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), ) X_subj_embed = LE.fit_transform(X_subj) # procrustes transformation From e369e2170c6a19b92c5d8ee8644ce4e4d1bd4481 Mon Sep 17 00:00:00 2001 From: mtorabi59 
Date: Wed, 31 Jul 2024 18:01:20 -0400 Subject: [PATCH 096/401] change nifti roi so it can handle common events files --- task_dFC/nifti_to_roi_signal.py | 77 ++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index 0d65049..46c0c66 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -84,52 +84,57 @@ def run_roi_signal_extraction( num_time_mri = time_series.n_time ################################# EXTRACT TASK LABELS ######################### oversampling = 50 # more samples per TR than the func data to have a better event_labels time resolution - if task == "task-restingstate": - events = [] - event_types = ["rest"] - event_labels = np.zeros((int(num_time_mri * oversampling), 1)) - task_labels = np.zeros((int(num_time_mri * oversampling), 1)) - Fs_task = float(1 / TR_mri) * oversampling - else: - ALL_EVENTS_FILES = os.listdir(task_events_root) + + ALL_EVENTS_FILES = os.listdir(task_events_root) + ALL_EVENTS_FILES = [ + file_i + for file_i in ALL_EVENTS_FILES + if (f"{subj}_" in file_i) + and (f"_{task}_" in file_i) + and ("events.tsv" in file_i) + ] + if not run is None: + ALL_EVENTS_FILES = [ + file_i for file_i in ALL_EVENTS_FILES if f"_{run}_" in file_i + ] + if not session is None: ALL_EVENTS_FILES = [ + file_i for file_i in ALL_EVENTS_FILES if f"_{session}_" in file_i + ] + + if not len(ALL_EVENTS_FILES) == 1: + # in some cases the event file is common for all subjects and can be found in f"{main_root}/bids" + ALL_EVENTS_FILES_COMMON = os.listdir(f"{main_root}/bids/") + ALL_EVENTS_FILES_COMMON = [ file_i - for file_i in ALL_EVENTS_FILES - if (f"{subj}_" in file_i) - and (f"_{task}_" in file_i) - and ("events.tsv" in file_i) + for file_i in ALL_EVENTS_FILES_COMMON + if (f"{task}_" in file_i) and ("events.tsv" in file_i) ] - if not run is None: - ALL_EVENTS_FILES = [ - file_i for file_i in ALL_EVENTS_FILES if 
f"_{run}_" in file_i - ] - if not session is None: - ALL_EVENTS_FILES = [ - file_i for file_i in ALL_EVENTS_FILES if f"_{session}_" in file_i - ] - if not len(ALL_EVENTS_FILES) == 1: + if len(ALL_EVENTS_FILES_COMMON) == 1: + events_file = f"{main_root}/bids/{ALL_EVENTS_FILES_COMMON[0]}" + else: # if the events file is not found, exclude the subject if run is None: print(f"Events file not found for {subj} {session_str} {task}") else: print(f"Events file not found for {subj} {session_str} {task} {run}") return - # load the tsv events file + else: events_file = f"{task_events_root}/{ALL_EVENTS_FILES[0]}" - events = np.genfromtxt(events_file, delimiter="\t", dtype=str) - # get the event labels - event_labels, Fs_task, event_types = task_utils.events_time_to_labels( - events=events, - TR_mri=TR_mri, - num_time_mri=num_time_mri, - event_types=None, - oversampling=oversampling, - return_0_1=False, - ) - # fill task labels with task's index - task_labels = np.ones((int(num_time_mri * oversampling), 1)) * TASKS.index( - task - ) + + # load the tsv events file + events = np.genfromtxt(events_file, delimiter="\t", dtype=str) + # get the event labels + event_labels, Fs_task, event_types = task_utils.events_time_to_labels( + events=events, + TR_mri=TR_mri, + num_time_mri=num_time_mri, + event_types=None, + oversampling=oversampling, + return_0_1=False, + ) + # fill task labels with task's index + task_labels = np.ones((int(num_time_mri * oversampling), 1)) * TASKS.index(task) ################################# SAVE ################################# # save the ROI time series and task data task_data = { From 007c81fb7bf43befcee3a343b03a5399ea01896b Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 31 Jul 2024 18:25:47 -0400 Subject: [PATCH 097/401] handle events files with diff trial type and rest labels --- pydfc/task_utils.py | 54 ++++++++++++++++---------- task_dFC/nifti_to_roi_signal.py | 9 +++++ task_dFC/run_scripts/dataset_info.json | 2 + 3 files changed, 45 
insertions(+), 20 deletions(-) diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py index a807da1..4dedc52 100644 --- a/pydfc/task_utils.py +++ b/pydfc/task_utils.py @@ -19,19 +19,29 @@ def events_time_to_labels( - events, TR_mri, num_time_mri, event_types=None, oversampling=50, return_0_1=False + events, + TR_mri, + num_time_mri, + event_types=None, + oversampling=50, + trial_type_label="trial_type", + rest_labels=["rest", "Rest"], + return_0_1=False, ): """ event_types is a list of event types to be considered. If None, it will found based on events. Assigns the longest event in each TR to that TR (in the interval from last TR to current TR). It assumes that the first time point is TR0 which corresponds to [0 sec, TR sec] interval. oversampling: number of samples per TR_mri to improve the time resolution of tasks + + if trial_type_label is None, we use event type "unknown" as the trial type """ # find which column is the "onset" in the first row onset_idx = np.where(events[0, :] == "onset")[0][0] duration_idx = np.where(events[0, :] == "duration")[0][0] - trial_type_idx = np.where(events[0, :] == "trial_type")[0][0] + if trial_type_label is not None: + trial_type_idx = np.where(events[0, :] == trial_type_label)[0][0] assert ( events[0, onset_idx] == "onset" @@ -39,19 +49,21 @@ def events_time_to_labels( assert ( events[0, duration_idx] == "duration" ), "Something went wrong with the events file! The duration column was not found!" - assert ( - events[0, trial_type_idx] == "trial_type" - ), "Something went wrong with the events file! The trial_type column was not found!" + if trial_type_label is not None: + assert ( + events[0, trial_type_idx] == trial_type_label + ), "Something went wrong with the events file! The trial_type column was not found!" 
if event_types is None: - event_types = list(np.unique(events[1:, trial_type_idx])) - # if rest is already there, remove it - if "rest" in event_types: - warnings.warn("rest is already in the event types") - event_types.remove("rest") - if "Rest" in event_types: - warnings.warn("Rest is already in the event types") - event_types.remove("Rest") + if trial_type_label is None: + event_types = ["unknown"] + else: + event_types = list(np.unique(events[1:, trial_type_idx])) + # remove all the rest labels + for rest_label in rest_labels: + if rest_label in event_types: + event_types.remove(rest_label) + # add the rest label to the beginning for consistency event_types = ["rest"] + event_types Fs = float(1 / TR_mri) * oversampling @@ -62,18 +74,20 @@ def events_time_to_labels( if i == 0: continue - if events[i, trial_type_idx] in event_types: - if ("rest" in events[i, trial_type_idx]) or ( - "Rest" in events[i, trial_type_idx] - ): + if trial_type_label is None: + trial_type = "unknown" + else: + trial_type = events[i, trial_type_idx] + + if trial_type in event_types: + # the only rest label that is left in event types is "rest" but we don't want to consider it + if trial_type == "rest": continue start_time = float(events[i, onset_idx]) end_time = float(events[i, onset_idx]) + float(events[i, duration_idx]) start_timepoint = int(np.rint(start_time * Fs)) end_timepoint = int(np.rint(end_time * Fs)) - event_labels[start_timepoint:end_timepoint] = event_types.index( - events[i, trial_type_idx] - ) + event_labels[start_timepoint:end_timepoint] = event_types.index(trial_type) if return_0_1: event_labels = np.multiply(event_labels != 0, 1) diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index 46c0c66..3953865 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -21,6 +21,8 @@ def run_roi_signal_extraction( output_root, session=None, RUNS=[None], + trial_type_label="trial_type", + rest_labels=[], ): """ Extract ROI 
signals and task labels for a given subject and task @@ -131,6 +133,8 @@ def run_roi_signal_extraction( num_time_mri=num_time_mri, event_types=None, oversampling=oversampling, + trial_type_label=trial_type_label, + rest_labels=rest_labels, return_0_1=False, ) # fill task labels with task's index @@ -226,6 +230,9 @@ def run_roi_signal_extraction( else: output_root = dataset_info["roi_root"] + trial_type_label = dataset_info["trial_type_label"] + rest_labels = dataset_info["rest_labels"] + for session in SESSIONS: for task in TASKS: run_roi_signal_extraction( @@ -237,6 +244,8 @@ def run_roi_signal_extraction( output_root=output_root, session=session, RUNS=RUNS[task], + trial_type_label=trial_type_label, + rest_labels=rest_labels, ) print( diff --git a/task_dFC/run_scripts/dataset_info.json b/task_dFC/run_scripts/dataset_info.json index 8296d5b..16d775e 100644 --- a/task_dFC/run_scripts/dataset_info.json +++ b/task_dFC/run_scripts/dataset_info.json @@ -7,6 +7,8 @@ "dFC_root" : "{main_root}/derivatives/dFC_assessed", "ML_root" : "{main_root}/derivatives/ML", "reports_root" : "{main_root}/derivatives/reports", + "trial_type_label" : "trial_type", + "rest_labels" : ["rest", "Rest"], "bold_suffix" : "_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz", "SESSIONS" : [ "ses-1" From da75c996435b900df3bdce697afa7bc2948d1d47 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 31 Jul 2024 23:35:35 -0400 Subject: [PATCH 098/401] add SI to report --- task_dFC/generate_report.py | 551 ++++++++++++++++++++++-------------- 1 file changed, 336 insertions(+), 215 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 2002bf6..4d1bdae 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -591,6 +591,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): if run is not None: dataframe = dataframe[dataframe["run"] == run] + # plot ARI score plt.figure(figsize=(10, 5)) g = sns.pointplot( 
data=dataframe[dataframe["task"] == task], @@ -616,7 +617,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): if run is None: plt.savefig( - f"{output_dir}/clustering_results_{task}.{save_fig_format}", + f"{output_dir}/clustering_results_ARI_{task}.{save_fig_format}", dpi=fig_dpi, bbox_inches=fig_bbox_inches, pad_inches=fig_pad, @@ -624,7 +625,49 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): ) else: plt.savefig( - f"{output_dir}/clustering_results_{task}_{run}.{save_fig_format}", + f"{output_dir}/clustering_results_ARI_{task}_{run}.{save_fig_format}", + dpi=fig_dpi, + bbox_inches=fig_bbox_inches, + pad_inches=fig_pad, + format=save_fig_format, + ) + + plt.close() + + # plot SI score + plt.figure(figsize=(10, 5)) + g = sns.pointplot( + data=dataframe[dataframe["task"] == task], + x="dFC method", + y="SI", + errorbar="sd", + linestyle="none", + dodge=True, + capsize=0.1, + ) + + if show_title: + g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"}) + # save the figure + if session is None: + output_dir = f"{output_root}/group_results/clustering" + else: + output_dir = f"{output_root}/group_results/clustering/{session}" + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + if run is None: + plt.savefig( + f"{output_dir}/clustering_results_SI_{task}.{save_fig_format}", + dpi=fig_dpi, + bbox_inches=fig_bbox_inches, + pad_inches=fig_pad, + format=save_fig_format, + ) + else: + plt.savefig( + f"{output_dir}/clustering_results_SI_{task}_{run}.{save_fig_format}", dpi=fig_dpi, bbox_inches=fig_bbox_inches, pad_inches=fig_pad, @@ -665,6 +708,7 @@ def plot_paradigm_clustering_score( paradigm_clustering_RESULTS = { "dFC method": [], "ARI score": [], + "SI score": [], } for result_file in ALL_PARADIGM_CLUSTERING_RESULTS: paradigm_clustering_RESULTS_new = np.load( @@ -676,6 +720,9 @@ def plot_paradigm_clustering_score( paradigm_clustering_RESULTS["ARI score"].append( 
paradigm_clustering_RESULTS_new["ARI"] ) + paradigm_clustering_RESULTS["SI score"].append( + paradigm_clustering_RESULTS_new["SI"] + ) sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0}) @@ -683,6 +730,7 @@ def plot_paradigm_clustering_score( dataframe = pd.DataFrame(paradigm_clustering_RESULTS) + # plot ARI score plt.figure(figsize=(10, 5)) g = sns.pointplot( data=dataframe, @@ -709,7 +757,7 @@ def plot_paradigm_clustering_score( os.makedirs(output_dir) plt.savefig( - f"{output_dir}/paradigm_clustering_results.{save_fig_format}", + f"{output_dir}/paradigm_clustering_results_ARI.{save_fig_format}", dpi=fig_dpi, bbox_inches=fig_bbox_inches, pad_inches=fig_pad, @@ -718,155 +766,191 @@ def plot_paradigm_clustering_score( plt.close() + # plot SI score + plt.figure(figsize=(10, 5)) + g = sns.pointplot( + data=dataframe, + x="dFC method", + y="SI score", + linestyle="none", + dodge=True, + capsize=0.1, + ) -def plot_paradigm_clstr_centroids( - ML_root, - output_root, - session=None, -): - """ """ - # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy - # find all the paradigm_clustering_RESULTS files in the directory - if session is None: - input_dir = f"{ML_root}" - else: - input_dir = f"{ML_root}/{session}" + if show_title: + g.set_title( + "Task Paradigm Clustering Performance", + fontdict={"fontsize": 10, "fontweight": "bold"}, + ) + # save the figure if session is None: - output_dir = f"{output_root}/group_results/paradigm_clustering_centroids" + output_dir = f"{output_root}/group_results/paradigm_clustering" else: - output_dir = ( - f"{output_root}/group_results/paradigm_clustering_centroids/{session}" - ) + output_dir = f"{output_root}/group_results/paradigm_clustering/{session}" if not os.path.exists(output_dir): os.makedirs(output_dir) - ALL_PARADIGM_CLUSTERING_RESULTS = os.listdir(input_dir) - ALL_PARADIGM_CLUSTERING_RESULTS = [ - result_file - for result_file in ALL_PARADIGM_CLUSTERING_RESULTS - if 
"task_paradigm_clstr_RESULTS_" in result_file - ] - ALL_PARADIGM_CLUSTERING_RESULTS.sort() - - for result_file in ALL_PARADIGM_CLUSTERING_RESULTS: - paradigm_clustering_RESULTS_new = np.load( - f"{input_dir}/{result_file}", allow_pickle="TRUE" - ).item() - - measure_name = paradigm_clustering_RESULTS_new["dFC_method"] - centroids_mats = paradigm_clustering_RESULTS_new["centroids"] - - centroids_dict = {} - for i, centroid_mat in enumerate(centroids_mats): - centroids_dict[f"Cluster {i + 1}"] = centroid_mat - - visualize_conn_mat_dict( - data=centroids_dict, - title=f"Task Paradigm Centroids {measure_name}", - cmap="seismic", - normalize=True, - disp_diag=False, - save_image=True, - output_root=f"{output_dir}/", - center_0=True, - # node_networks=None, - ) - - -def plot_dFC_clustering( - dFC_root, - subj, - task, - start_time, - end_time, - output_root, - run=None, - session=None, - normalize_dFC=True, -): - task_data = load_task_data(roi_root, subj, task, run, session) - TR_mri = task_data["TR_mri"] - - for dFC_id in range( - 0, 20 - ): # change this to the number of dFCs you have or right a function that finds available dFC ids - try: - dFC = load_dFC(dFC_root, subj, task, dFC_id, run, session) - except Exception: - pass - - dFC_mat = dFC.get_dFC_mat() - TR_array = dFC.TR_array - if normalize_dFC: - dFC_mat = rank_norm(dFC_mat) - dFC_vecs = dFC_mat2vec(dFC_mat) - - if session is None: - clustering_RESULTS = np.load( - f"{ML_root}/clustering_RESULTS_{dFC_id}.npy", allow_pickle="TRUE" - ).item() - else: - clustering_RESULTS = np.load( - f"{ML_root}/{session}/clustering_RESULTS_{dFC_id}.npy", - allow_pickle="TRUE", - ).item() + plt.savefig( + f"{output_dir}/paradigm_clustering_results_SI.{save_fig_format}", + dpi=fig_dpi, + bbox_inches=fig_bbox_inches, + pad_inches=fig_pad, + format=save_fig_format, + ) - if run is None: - scaler = clustering_RESULTS[task]["StandardScaler"] - pca = clustering_RESULTS[task]["PCA"] - kmeans = clustering_RESULTS[task]["kmeans"] - else: 
- scaler = clustering_RESULTS[task][run]["StandardScaler"] - pca = clustering_RESULTS[task][run]["PCA"] - kmeans = clustering_RESULTS[task][run]["kmeans"] - - dFC_vecs_normalized = scaler.transform(dFC_vecs) - dFC_vecs_pca = pca.transform(dFC_vecs_normalized) - cluster_labels = kmeans.predict(dFC_vecs_pca) - - start_TR = int(start_time / TR_mri) - end_TR = int(end_time / TR_mri) - - start_TR_idx = np.where(np.array(TR_array) >= start_TR)[0][0] - end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1] - - fig_width = int(2.5 * (end_time - start_time) / 2) - fig_width = min(fig_width, 500) - plt.figure(figsize=(fig_width, 5)) - time = TR_array[start_TR_idx:end_TR_idx] * TR_mri - plt.plot( - time[start_TR:end_TR], cluster_labels[start_TR_idx:end_TR_idx], linewidth=4 - ) - # put vertical lines at the start of each TR - for t in time: - plt.axvline(x=t, color="r", linestyle="--") - # plt.text(t, 0.5, f"TR {int(t/TR_mri)}", fontsize=8, color='black', ha='center') - plt.title(f"Cluster labels of {dFC.measure.measure_name}") - plt.xlabel("Time (s)") + plt.close() - # save the figure - output_dir = f"{output_root}/subject_results/{subj}/dFC_clustering" - if session is not None: - output_dir = f"{output_dir}/{session}" - output_dir = f"{output_dir}/{task}" - if run is not None: - output_dir = f"{output_dir}/{run}" - output_dir = f"{output_dir}/" - if not os.path.exists(output_dir): - os.makedirs(output_dir) +# def plot_paradigm_clstr_centroids( +# ML_root, +# output_root, +# session=None, +# ): +# """ """ +# # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy +# # find all the paradigm_clustering_RESULTS files in the directory +# if session is None: +# input_dir = f"{ML_root}" +# else: +# input_dir = f"{ML_root}/{session}" - plt.savefig( - f"{output_dir}/dFC_clustering_{dFC.measure.measure_name}.{save_fig_format}", - dpi=fig_dpi, - bbox_inches=fig_bbox_inches, - pad_inches=fig_pad, - format=save_fig_format, - ) +# if session is 
None: +# output_dir = f"{output_root}/group_results/paradigm_clustering_centroids" +# else: +# output_dir = ( +# f"{output_root}/group_results/paradigm_clustering_centroids/{session}" +# ) + +# if not os.path.exists(output_dir): +# os.makedirs(output_dir) + +# ALL_PARADIGM_CLUSTERING_RESULTS = os.listdir(input_dir) +# ALL_PARADIGM_CLUSTERING_RESULTS = [ +# result_file +# for result_file in ALL_PARADIGM_CLUSTERING_RESULTS +# if "task_paradigm_clstr_RESULTS_" in result_file +# ] +# ALL_PARADIGM_CLUSTERING_RESULTS.sort() + +# for result_file in ALL_PARADIGM_CLUSTERING_RESULTS: +# paradigm_clustering_RESULTS_new = np.load( +# f"{input_dir}/{result_file}", allow_pickle="TRUE" +# ).item() + +# measure_name = paradigm_clustering_RESULTS_new["dFC_method"] +# centroids_mats = paradigm_clustering_RESULTS_new["centroids"] + +# centroids_dict = {} +# for i, centroid_mat in enumerate(centroids_mats): +# centroids_dict[f"Cluster {i + 1}"] = centroid_mat + +# visualize_conn_mat_dict( +# data=centroids_dict, +# title=f"Task Paradigm Centroids {measure_name}", +# cmap="seismic", +# normalize=True, +# disp_diag=False, +# save_image=True, +# output_root=f"{output_dir}/", +# center_0=True, +# # node_networks=None, +# ) + + +# def plot_dFC_clustering( +# dFC_root, +# subj, +# task, +# start_time, +# end_time, +# output_root, +# run=None, +# session=None, +# normalize_dFC=True, +# ): +# task_data = load_task_data(roi_root, subj, task, run, session) +# TR_mri = task_data["TR_mri"] + +# for dFC_id in range( +# 0, 20 +# ): # change this to the number of dFCs you have or right a function that finds available dFC ids +# try: +# dFC = load_dFC(dFC_root, subj, task, dFC_id, run, session) +# except Exception: +# pass + +# dFC_mat = dFC.get_dFC_mat() +# TR_array = dFC.TR_array +# if normalize_dFC: +# dFC_mat = rank_norm(dFC_mat) +# dFC_vecs = dFC_mat2vec(dFC_mat) + +# if session is None: +# clustering_RESULTS = np.load( +# f"{ML_root}/clustering_RESULTS_{dFC_id}.npy", allow_pickle="TRUE" +# 
).item() +# else: +# clustering_RESULTS = np.load( +# f"{ML_root}/{session}/clustering_RESULTS_{dFC_id}.npy", +# allow_pickle="TRUE", +# ).item() - plt.close() +# if run is None: +# scaler = clustering_RESULTS[task]["StandardScaler"] +# pca = clustering_RESULTS[task]["PCA"] +# kmeans = clustering_RESULTS[task]["kmeans"] +# else: +# scaler = clustering_RESULTS[task][run]["StandardScaler"] +# pca = clustering_RESULTS[task][run]["PCA"] +# kmeans = clustering_RESULTS[task][run]["kmeans"] + +# dFC_vecs_normalized = scaler.transform(dFC_vecs) +# dFC_vecs_pca = pca.transform(dFC_vecs_normalized) +# cluster_labels = kmeans.predict(dFC_vecs_pca) + +# start_TR = int(start_time / TR_mri) +# end_TR = int(end_time / TR_mri) + +# start_TR_idx = np.where(np.array(TR_array) >= start_TR)[0][0] +# end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1] + +# fig_width = int(2.5 * (end_time - start_time) / 2) +# fig_width = min(fig_width, 500) +# plt.figure(figsize=(fig_width, 5)) +# time = TR_array[start_TR_idx:end_TR_idx] * TR_mri +# plt.plot( +# time[start_TR:end_TR], cluster_labels[start_TR_idx:end_TR_idx], linewidth=4 +# ) +# # put vertical lines at the start of each TR +# for t in time: +# plt.axvline(x=t, color="r", linestyle="--") +# # plt.text(t, 0.5, f"TR {int(t/TR_mri)}", fontsize=8, color='black', ha='center') +# plt.title(f"Cluster labels of {dFC.measure.measure_name}") +# plt.xlabel("Time (s)") + +# # save the figure +# output_dir = f"{output_root}/subject_results/{subj}/dFC_clustering" +# if session is not None: +# output_dir = f"{output_dir}/{session}" +# output_dir = f"{output_dir}/{task}" +# if run is not None: +# output_dir = f"{output_dir}/{run}" +# output_dir = f"{output_dir}/" + +# if not os.path.exists(output_dir): +# os.makedirs(output_dir) + +# plt.savefig( +# f"{output_dir}/dFC_clustering_{dFC.measure.measure_name}.{save_fig_format}", +# dpi=fig_dpi, +# bbox_inches=fig_bbox_inches, +# pad_inches=fig_pad, +# format=save_fig_format, +# ) + +# plt.close() def 
plot_task_presence_features( @@ -1042,28 +1126,28 @@ def create_html_report_subj_results( ) file.write("
\n") - # display dFC clustering - img_height = 100 - # for dFC matrices find all png files in the directory - dFC_clustering_dir = f"{subj_dir}/dFC_clustering/{session_task_run_dir}" - if os.path.exists(dFC_clustering_dir): - for file_name in os.listdir(dFC_clustering_dir): - if file_name.endswith(".png"): - file.write( - f"

{file_name[file_name.find('dFC_clustering_')+15:file_name.find('.png')]}

\n" - ) - dFC_clustering_img = f"{dFC_clustering_dir}/{file_name}" - # get the original size of the image - img = plt.imread(dFC_clustering_img) - height, width, _ = img.shape - # change the width so that height equals img_height - width = int(width * img_height / height) - # replace the path to the image with a relative path - dFC_clustering_img = dFC_clustering_img.replace(subj_dir, ".") - file.write( - f"{file_name}\n" - ) - file.write("
\n") + # # display dFC clustering + # img_height = 100 + # # for dFC matrices find all png files in the directory + # dFC_clustering_dir = f"{subj_dir}/dFC_clustering/{session_task_run_dir}" + # if os.path.exists(dFC_clustering_dir): + # for file_name in os.listdir(dFC_clustering_dir): + # if file_name.endswith(".png"): + # file.write( + # f"

{file_name[file_name.find('dFC_clustering_')+15:file_name.find('.png')]}

\n" + # ) + # dFC_clustering_img = f"{dFC_clustering_dir}/{file_name}" + # # get the original size of the image + # img = plt.imread(dFC_clustering_img) + # height, width, _ = img.shape + # # change the width so that height equals img_height + # width = int(width * img_height / height) + # # replace the path to the image with a relative path + # dFC_clustering_img = dFC_clustering_img.replace(subj_dir, ".") + # file.write( + # f"{file_name}\n" + # ) + # file.write("
\n") file.write("\n") file.write("\n") file.close() @@ -1189,12 +1273,31 @@ def create_html_report_group_results( else: clustering_dir = f"{group_dir}/clustering" - # display clustering results + # display clustering ARI results + if run is None: + clustering_img = f"{clustering_dir}/clustering_results_ARI_{task}.png" + else: + clustering_img = ( + f"{clustering_dir}/clustering_results_ARI_{task}_{run}.png" + ) + img = plt.imread(clustering_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + clustering_img = clustering_img.replace(group_dir, ".") + file.write( + f"Clustering results\n" + ) + + file.write("
\n") + + # display clustering SI results if run is None: - clustering_img = f"{clustering_dir}/clustering_results_{task}.png" + clustering_img = f"{clustering_dir}/clustering_results_SI_{task}.png" else: clustering_img = ( - f"{clustering_dir}/clustering_results_{task}_{run}.png" + f"{clustering_dir}/clustering_results_SI_{task}_{run}.png" ) img = plt.imread(clustering_img) height, width, _ = img.shape @@ -1218,11 +1321,11 @@ def create_html_report_group_results( else: paradigm_clustering_dir = f"{group_dir}/paradigm_clustering" - # display paradigm clustering scores + # display paradigm clustering ARI scores img_height = 300 - file.write("

Paradigm Clustering Scores

\n") + file.write("

Paradigm Clustering ARI Scores

\n") paradigm_clustering_img = ( - f"{paradigm_clustering_dir}/paradigm_clustering_results.png" + f"{paradigm_clustering_dir}/paradigm_clustering_results_ARI.png" ) img = plt.imread(paradigm_clustering_img) height, width, _ = img.shape @@ -1236,36 +1339,54 @@ def create_html_report_group_results( file.write("
\n") - # display paradigm clustering centroids + # display paradigm clustering SI scores img_height = 300 - file.write("

Paradigm Clustering Centroids

\n") - # find all png files in the directory - paradigm_clustering_centroids_dir = f"{group_dir}/paradigm_clustering_centroids" - for file_name in os.listdir(paradigm_clustering_centroids_dir): - if file_name.endswith(".png"): - measure_name = file_name[ - file_name.find("Task_Paradigm_Centroids_") + 24 : -4 - ] - file.write(f"

{measure_name}

\n") - paradigm_clustering_centroids_img = ( - f"{paradigm_clustering_centroids_dir}/{file_name}" - ) - # get the original size of the image - img = plt.imread(paradigm_clustering_centroids_img) - height, width, _ = img.shape - # change the width so that height equals img_height - width = int(width * img_height / height) - # replace the path to the image with a relative path - paradigm_clustering_centroids_img = ( - paradigm_clustering_centroids_img.replace(group_dir, ".") - ) - file.write( - f"Paradigm clustering centroids\n" - ) - file.write("
\n") + file.write("

Paradigm Clustering SI Scores

\n") + paradigm_clustering_img = ( + f"{paradigm_clustering_dir}/paradigm_clustering_results_SI.png" + ) + img = plt.imread(paradigm_clustering_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".") + file.write( + f"Paradigm clustering results\n" + ) file.write("
\n") + # # display paradigm clustering centroids + # img_height = 300 + # file.write("

Paradigm Clustering Centroids

\n") + # # find all png files in the directory + # paradigm_clustering_centroids_dir = f"{group_dir}/paradigm_clustering_centroids" + # for file_name in os.listdir(paradigm_clustering_centroids_dir): + # if file_name.endswith(".png"): + # measure_name = file_name[ + # file_name.find("Task_Paradigm_Centroids_") + 24 : -4 + # ] + # file.write(f"

{measure_name}

\n") + # paradigm_clustering_centroids_img = ( + # f"{paradigm_clustering_centroids_dir}/{file_name}" + # ) + # # get the original size of the image + # img = plt.imread(paradigm_clustering_centroids_img) + # height, width, _ = img.shape + # # change the width so that height equals img_height + # width = int(width * img_height / height) + # # replace the path to the image with a relative path + # paradigm_clustering_centroids_img = ( + # paradigm_clustering_centroids_img.replace(group_dir, ".") + # ) + # file.write( + # f"Paradigm clustering centroids\n" + # ) + # file.write("
\n") + + # file.write("
\n") + file.write("\n") file.write("\n") file.close() @@ -1409,20 +1530,20 @@ def create_html_report_group_results( except Exception as e: print(f"Error in plotting task presence: {e}") - try: - plot_dFC_clustering( - dFC_root=dFC_root, - subj=subj, - task=task, - start_time=start_time, - end_time=end_time, - output_root=reports_root, - run=run, - session=session, - normalize_dFC=True, - ) - except Exception as e: - print(f"Error in plotting dFC clustering: {e}") + # try: + # plot_dFC_clustering( + # dFC_root=dFC_root, + # subj=subj, + # task=task, + # start_time=start_time, + # end_time=end_time, + # output_root=reports_root, + # run=run, + # session=session, + # normalize_dFC=True, + # ) + # except Exception as e: + # print(f"Error in plotting dFC clustering: {e}") # create html report try: create_html_report_subj_results( @@ -1470,14 +1591,14 @@ def create_html_report_group_results( except Exception as e: print(f"Error in plotting paradigm clustering scores: {e}") - try: - plot_paradigm_clstr_centroids( - ML_root=ML_root, - output_root=reports_root, - session=session, - ) - except Exception as e: - print(f"Error in plotting paradigm clustering centroids: {e}") + # try: + # plot_paradigm_clstr_centroids( + # ML_root=ML_root, + # output_root=reports_root, + # session=session, + # ) + # except Exception as e: + # print(f"Error in plotting paradigm clustering centroids: {e}") for task in TASKS: for run in RUNS[task]: From bf74c8b34fa56e1b91d3ee2bd99dcd75e8cc73fa Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Wed, 31 Jul 2024 23:58:02 -0400 Subject: [PATCH 099/401] minor change --- task_dFC/generate_report.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 4d1bdae..21bc05b 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -1218,14 +1218,14 @@ def create_html_report_group_results( classification_dir = f"{group_dir}/classification" # display Random 
Forest classification results - file.write("

Gradient Boosting

\n") + file.write("

KNN

\n") if run is None: classification_img = ( - f"{classification_dir}/ML_results_classify_GBT_{task}.png" + f"{classification_dir}/ML_results_classify_KNN_{task}.png" ) else: classification_img = ( - f"{classification_dir}/ML_results_classify_GBT_{task}_{run}.png" + f"{classification_dir}/ML_results_classify_KNN_{task}_{run}.png" ) img = plt.imread(classification_img) height, width, _ = img.shape @@ -1609,10 +1609,10 @@ def create_html_report_group_results( task=task, run=run, session=session, - ML_algorithm="Gradient Boosting", + ML_algorithm="KNN", ) except Exception as e: - print(f"Error in plotting ML results for GBT: {e}") + print(f"Error in plotting ML results for KNN: {e}") try: plot_ML_results( ML_root=ML_root, From ea220ba735d2135926a93d482dcd6b3c837fcf12 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 2 Aug 2024 15:31:13 -0400 Subject: [PATCH 100/401] concat+embed LE --- task_dFC/ML.py | 151 ++++++++++++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 66 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index eb35ccd..62c92d6 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -290,53 +290,88 @@ def embed_dFC_features( embedding="PCA", n_components=30, n_neighbors_LE=125, + LE_embedding_method="concat+embed", ): """ Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous. for LE, first the LE is applied on each subj separately and then the procrustes transformation is applied to align the embeddings of different subjects. All the subjects are transformed into the space of the subject with the highest silhouette score. 
+ + LE_embedding_method: "concat+embed" or "embed+procrustes" """ if embedding == "PCA": pca = PCA(n_components=n_components, svd_solver="full", whiten=False) pca.fit(X_train) X_train_embed = pca.transform(X_train) - X_test_embed = pca.transform(X_test) + if X_test is not None: + X_test_embed = pca.transform(X_test) + else: + X_test_embed = None elif embedding == "LE": - # first embed the dFC features of each subject into a lower dimensional space using LE separately - embed_dict = {} - for subject in train_subjects: - # assert the samples of the same subject are contiguous - assert np.all( - np.diff(np.where(subj_label_train == subject)[0]) == 1 - ), f"Indices of {subject} are not consecutive" - X_subj = X_train[subj_label_train == subject, :] - y_subj = y_train[subj_label_train == subject] - LE = SpectralEmbedding( - n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), - ) - X_subj_embed = LE.fit_transform(X_subj) - SI = silhouette_score(X_subj_embed, y_subj) - embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI} - - # find the best transformation based on the SI score - best_SI = -1 - best_subject = None - for subject in embed_dict: - if embed_dict[subject]["SI"] > best_SI: - best_SI = embed_dict[subject]["SI"] - best_subject = subject - - # apply procrustes transformation to align the embeddings of different subjects - # use the embeddings of the subject with the highest SI score as the reference - X_train_embed = None - for subject in train_subjects: - X_subj_embed = embed_dict[subject]["X_subj_embed"] - # procrustes transformation - if subject == best_subject: - X_subj_embed_transformed = X_subj_embed - else: + if LE_embedding_method == "embed+procrustes": + # first embed the dFC features of each subject into a lower dimensional space using LE separately + embed_dict = {} + for subject in train_subjects: + # assert the samples of the same subject are contiguous + assert np.all( + np.diff(np.where(subj_label_train == 
subject)[0]) == 1 + ), f"Indices of {subject} are not consecutive" + X_subj = X_train[subj_label_train == subject, :] + y_subj = y_train[subj_label_train == subject] + LE = SpectralEmbedding( + n_components=n_components, + n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + ) + X_subj_embed = LE.fit_transform(X_subj) + SI = silhouette_score(X_subj_embed, y_subj) + embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI} + + # find the best transformation based on the SI score + best_SI = -1 + best_subject = None + for subject in embed_dict: + if embed_dict[subject]["SI"] > best_SI: + best_SI = embed_dict[subject]["SI"] + best_subject = subject + + # apply procrustes transformation to align the embeddings of different subjects + # use the embeddings of the subject with the highest SI score as the reference + X_train_embed = None + for subject in train_subjects: + X_subj_embed = embed_dict[subject]["X_subj_embed"] + # procrustes transformation + if subject == best_subject: + X_subj_embed_transformed = X_subj_embed + else: + # for the procrustes transformation, the number of samples should be the same + X_best_subj_embed = precheck_for_procruste( + embed_dict[best_subject]["X_subj_embed"], X_subj_embed + ) + _, X_subj_embed_transformed, _ = procrustes( + X_best_subj_embed, X_subj_embed + ) + if X_train_embed is None: + X_train_embed = X_subj_embed_transformed + else: + X_train_embed = np.concatenate( + (X_train_embed, X_subj_embed_transformed), axis=0 + ) + + # apply the same transformation to the test set + X_test_embed = None + for subject in test_subjects: + # assert the samples of the same subject are contiguous + assert np.all( + np.diff(np.where(subj_label_test == subject)[0]) == 1 + ), f"Indices of {subject} are not consecutive" + X_subj = X_test[subj_label_test == subject, :] + LE = SpectralEmbedding( + n_components=n_components, + n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + ) + X_subj_embed = LE.fit_transform(X_subj) + # procrustes 
transformation # for the procrustes transformation, the number of samples should be the same X_best_subj_embed = precheck_for_procruste( embed_dict[best_subject]["X_subj_embed"], X_subj_embed @@ -344,38 +379,19 @@ def embed_dFC_features( _, X_subj_embed_transformed, _ = procrustes( X_best_subj_embed, X_subj_embed ) - if X_train_embed is None: - X_train_embed = X_subj_embed_transformed - else: - X_train_embed = np.concatenate( - (X_train_embed, X_subj_embed_transformed), axis=0 - ) - - # apply the same transformation to the test set - X_test_embed = None - for subject in test_subjects: - # assert the samples of the same subject are contiguous - assert np.all( - np.diff(np.where(subj_label_test == subject)[0]) == 1 - ), f"Indices of {subject} are not consecutive" - X_subj = X_test[subj_label_test == subject, :] - LE = SpectralEmbedding( - n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), - ) - X_subj_embed = LE.fit_transform(X_subj) - # procrustes transformation - # for the procrustes transformation, the number of samples should be the same - X_best_subj_embed = precheck_for_procruste( - embed_dict[best_subject]["X_subj_embed"], X_subj_embed - ) - _, X_subj_embed_transformed, _ = procrustes(X_best_subj_embed, X_subj_embed) - if X_test_embed is None: - X_test_embed = X_subj_embed_transformed + if X_test_embed is None: + X_test_embed = X_subj_embed_transformed + else: + X_test_embed = np.concatenate( + (X_test_embed, X_subj_embed_transformed), axis=0 + ) + elif LE_embedding_method == "concat+embed": + LE = SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors_LE) + X_train_embed = LE.fit_transform(X_train) + if X_test is not None: + X_test_embed = LE.transform(X_test) else: - X_test_embed = np.concatenate( - (X_test_embed, X_subj_embed_transformed), axis=0 - ) + X_test_embed = None return X_train_embed, X_test_embed @@ -698,6 +714,7 @@ def task_presence_classification( embedding="LE", n_components=30, n_neighbors_LE=125, + 
LE_embedding_method="concat+embed", ) # task presence classification @@ -831,6 +848,7 @@ def task_presence_clustering( embedding="LE", n_components=30, n_neighbors_LE=125, + LE_embedding_method="concat+embed", ) # clustering @@ -1083,6 +1101,7 @@ def task_paradigm_clustering( embedding="LE", n_components=30, n_neighbors_LE=125, + LE_embedding_method="concat+embed", ) # clustering From d63f7ec70b90f8f29f7c544e729fcbc1f0e12735 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 2 Aug 2024 16:40:33 -0400 Subject: [PATCH 101/401] minor fix --- task_dFC/ML.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 62c92d6..84472b6 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -386,10 +386,16 @@ def embed_dFC_features( (X_test_embed, X_subj_embed_transformed), axis=0 ) elif LE_embedding_method == "concat+embed": + # since SpectralEmbedding does not have transform method, we need to fit the LE on the whole data + if X_test is not None: + X_concat = np.concatenate((X_train, X_test), axis=0) + else: + X_concat = X_train LE = SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors_LE) - X_train_embed = LE.fit_transform(X_train) + X_concat_embed = LE.fit_transform(X_concat) + X_train_embed = X_concat_embed[: X_train.shape[0], :] if X_test is not None: - X_test_embed = LE.transform(X_test) + X_test_embed = X_concat_embed[X_train.shape[0] :, :] else: X_test_embed = None From 1b5a439a4c99b29f79144ed4e68e23a616390c1f Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 6 Aug 2024 17:58:16 -0400 Subject: [PATCH 102/401] add generalized LE and corr distance for LE --- task_dFC/ML.py | 343 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 262 insertions(+), 81 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 84472b6..8c16cca 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -12,7 +12,7 @@ from sklearn.manifold import SpectralEmbedding from sklearn.metrics import 
adjusted_rand_score, balanced_accuracy_score, silhouette_score from sklearn.model_selection import GridSearchCV, RandomizedSearchCV -from sklearn.neighbors import KNeighborsClassifier +from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler @@ -278,6 +278,245 @@ def precheck_for_procruste(X_best, X_subj): return X_best_new +def generalized_procrustes(X_list): + """ + Generalized Procrustes Analysis + + returns the mean X to be used as the reference for procrustes transformation + """ + # initialize Procrustes distance + current_distance = 0 + + # initialize a mean X + mean_X = np.array(X_list[0]) + + num_X = len(X_list) + + # create array for new Xs, add + new_Xs = np.zeros(np.array(X_list).shape) + + while True: + # add the mean X as first element of array + new_Xs[0] = mean_X + + # superimpose all shapes to current mean + for i in range(1, num_X): + _, new_X, _ = procrustes(mean_X, X_list[i]) + new_Xs[i] = new_X + + # calculate new mean + new_mean = np.mean(new_Xs, axis=0) + + _, _, new_distance = procrustes(new_mean, mean_X) + + # if the distance did not change, break the cycle + if np.abs(new_distance - current_distance) < 1e-6: + break + + # align the new_mean to old mean + _, new_mean, _ = procrustes(mean_X, new_mean) + + # update mean and distance + mean_X = new_mean + current_distance = new_distance + + return mean_X + + +def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"): + """ + Apply Laplacian Eigenmaps (LE) to transform data into a lower dimensional space. 
+ """ + affinity_matrix = kneighbors_graph( + X, + n_neighbors=n_neighbors, + mode="connectivity", + include_self=False, + metric=distance_metric, + ) + affinity_matrix = affinity_matrix.toarray() + affinity_matrix = np.divide(affinity_matrix + affinity_matrix.T, 2) + LE = SpectralEmbedding( + n_components=n_components, affinity="precomputed", n_neighbors=n_neighbors + ) + X_embed = LE.fit_transform(X=affinity_matrix) + return X_embed + + +def LE_embed_procustes( + X_train, + X_test, + y_train, + y_test, + subj_label_train, + subj_label_test, + train_subjects, + test_subjects, + n_components=30, + n_neighbors_LE=125, + procruste_method="best_SI", +): + if procruste_method == "best_SI": + # first embed the dFC features of each subject into a lower dimensional space using LE separately + embed_dict = {} + for subject in train_subjects: + # assert the samples of the same subject are contiguous + assert np.all( + np.diff(np.where(subj_label_train == subject)[0]) == 1 + ), f"Indices of {subject} are not consecutive" + X_subj = X_train[subj_label_train == subject, :] + y_subj = y_train[subj_label_train == subject] + X_subj_embed = LE_transform( + X=X_subj, + n_components=n_components, + n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + distance_metric="correlation", + ) + SI = silhouette_score(X_subj_embed, y_subj) + embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI} + + # find the best transformation based on the SI score + best_SI = -1 + best_subject = None + for subject in embed_dict: + if embed_dict[subject]["SI"] > best_SI: + best_SI = embed_dict[subject]["SI"] + best_subject = subject + + # apply procrustes transformation to align the embeddings of different subjects + # use the embeddings of the subject with the highest SI score as the reference + X_train_embed = None + for subject in train_subjects: + X_subj_embed = embed_dict[subject]["X_subj_embed"] + # procrustes transformation + if subject == best_subject: + X_subj_embed_transformed = 
X_subj_embed + else: + # for the procrustes transformation, the number of samples should be the same + X_best_subj_embed = precheck_for_procruste( + embed_dict[best_subject]["X_subj_embed"], X_subj_embed + ) + _, X_subj_embed_transformed, _ = procrustes( + X_best_subj_embed, X_subj_embed + ) + if X_train_embed is None: + X_train_embed = X_subj_embed_transformed + else: + X_train_embed = np.concatenate( + (X_train_embed, X_subj_embed_transformed), axis=0 + ) + + # apply the same transformation to the test set + X_test_embed = None + for subject in test_subjects: + # assert the samples of the same subject are contiguous + assert np.all( + np.diff(np.where(subj_label_test == subject)[0]) == 1 + ), f"Indices of {subject} are not consecutive" + X_subj = X_test[subj_label_test == subject, :] + X_subj_embed = LE_transform( + X=X_subj, + n_components=n_components, + n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + distance_metric="correlation", + ) + # procrustes transformation + # for the procrustes transformation, the number of samples should be the same + X_best_subj_embed = precheck_for_procruste( + embed_dict[best_subject]["X_subj_embed"], X_subj_embed + ) + _, X_subj_embed_transformed, _ = procrustes(X_best_subj_embed, X_subj_embed) + if X_test_embed is None: + X_test_embed = X_subj_embed_transformed + else: + X_test_embed = np.concatenate( + (X_test_embed, X_subj_embed_transformed), axis=0 + ) + + elif procruste_method == "generalized": + # in this method we use generalized procrustes analysis to align the embeddings of different subjects + # first embed the dFC features of each subject into a lower dimensional space using LE separately + embed_dict = {} + for subject in train_subjects: + # assert the samples of the same subject are contiguous + assert np.all( + np.diff(np.where(subj_label_train == subject)[0]) == 1 + ), f"Indices of {subject} are not consecutive" + X_subj = X_train[subj_label_train == subject, :] + X_subj_embed = LE_transform( + X=X_subj, + 
n_components=n_components, + n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + distance_metric="correlation", + ) + embed_dict[subject] = X_subj_embed + + # then find the max number of samples among all subjects + max_samples = 0 + for subject in train_subjects: + if embed_dict[subject].shape[0] > max_samples: + max_samples = embed_dict[subject].shape[0] + + # find the mean embedding of all subjects to use as the reference for procrustes transformation + X_train_list = [] + for subject in train_subjects: + X_subj_embed = embed_dict[subject] + # add zero rows to the embedding of the subject with less samples + if X_subj_embed.shape[0] < max_samples: + X_subj_embed_new = np.concatenate( + ( + X_subj_embed, + np.zeros( + ( + max_samples - X_subj_embed.shape[0], + X_subj_embed.shape[1], + ) + ), + ), + axis=0, + ) + else: + X_subj_embed_new = X_subj_embed + X_train_list.append(X_subj_embed_new) + mean_X_train = generalized_procrustes(X_train_list) + + X_train_embed = None + for subject in train_subjects: + X_subj_embed = embed_dict[subject] + mean_X_train_new_size = precheck_for_procruste(mean_X_train, X_subj_embed) + _, X_subj_embed_transformed, _ = procrustes( + mean_X_train_new_size, X_subj_embed + ) + if X_train_embed is None: + X_train_embed = X_subj_embed_transformed + else: + X_train_embed = np.concatenate( + (X_train_embed, X_subj_embed_transformed), axis=0 + ) + + X_test_embed = None + for subject in test_subjects: + X_subj = X_test[subj_label_test == subject, :] + X_subj_embed = LE_transform( + X=X_subj, + n_components=n_components, + n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + distance_metric="correlation", + ) + mean_X_train_new_size = precheck_for_procruste(mean_X_train, X_subj_embed) + _, X_subj_embed_transformed, _ = procrustes( + mean_X_train_new_size, X_subj_embed + ) + if X_test_embed is None: + X_test_embed = X_subj_embed_transformed + else: + X_test_embed = np.concatenate( + (X_test_embed, X_subj_embed_transformed), axis=0 + ) + + return 
X_train_embed, X_test_embed + + def embed_dFC_features( train_subjects, test_subjects, @@ -310,89 +549,31 @@ def embed_dFC_features( X_test_embed = None elif embedding == "LE": if LE_embedding_method == "embed+procrustes": - # first embed the dFC features of each subject into a lower dimensional space using LE separately - embed_dict = {} - for subject in train_subjects: - # assert the samples of the same subject are contiguous - assert np.all( - np.diff(np.where(subj_label_train == subject)[0]) == 1 - ), f"Indices of {subject} are not consecutive" - X_subj = X_train[subj_label_train == subject, :] - y_subj = y_train[subj_label_train == subject] - LE = SpectralEmbedding( - n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), - ) - X_subj_embed = LE.fit_transform(X_subj) - SI = silhouette_score(X_subj_embed, y_subj) - embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI} - - # find the best transformation based on the SI score - best_SI = -1 - best_subject = None - for subject in embed_dict: - if embed_dict[subject]["SI"] > best_SI: - best_SI = embed_dict[subject]["SI"] - best_subject = subject - - # apply procrustes transformation to align the embeddings of different subjects - # use the embeddings of the subject with the highest SI score as the reference - X_train_embed = None - for subject in train_subjects: - X_subj_embed = embed_dict[subject]["X_subj_embed"] - # procrustes transformation - if subject == best_subject: - X_subj_embed_transformed = X_subj_embed - else: - # for the procrustes transformation, the number of samples should be the same - X_best_subj_embed = precheck_for_procruste( - embed_dict[best_subject]["X_subj_embed"], X_subj_embed - ) - _, X_subj_embed_transformed, _ = procrustes( - X_best_subj_embed, X_subj_embed - ) - if X_train_embed is None: - X_train_embed = X_subj_embed_transformed - else: - X_train_embed = np.concatenate( - (X_train_embed, X_subj_embed_transformed), axis=0 - ) - - # apply the same 
transformation to the test set - X_test_embed = None - for subject in test_subjects: - # assert the samples of the same subject are contiguous - assert np.all( - np.diff(np.where(subj_label_test == subject)[0]) == 1 - ), f"Indices of {subject} are not consecutive" - X_subj = X_test[subj_label_test == subject, :] - LE = SpectralEmbedding( - n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), - ) - X_subj_embed = LE.fit_transform(X_subj) - # procrustes transformation - # for the procrustes transformation, the number of samples should be the same - X_best_subj_embed = precheck_for_procruste( - embed_dict[best_subject]["X_subj_embed"], X_subj_embed - ) - _, X_subj_embed_transformed, _ = procrustes( - X_best_subj_embed, X_subj_embed - ) - if X_test_embed is None: - X_test_embed = X_subj_embed_transformed - else: - X_test_embed = np.concatenate( - (X_test_embed, X_subj_embed_transformed), axis=0 - ) + X_train_embed, X_test_embed = LE_embed_procustes( + X_train=X_train, + X_test=X_test, + y_train=y_train, + y_test=y_test, + subj_label_train=subj_label_train, + subj_label_test=subj_label_test, + train_subjects=train_subjects, + test_subjects=test_subjects, + n_components=n_components, + n_neighbors_LE=n_neighbors_LE, + procruste_method="generalized", + ) elif LE_embedding_method == "concat+embed": # since SpectralEmbedding does not have transform method, we need to fit the LE on the whole data if X_test is not None: X_concat = np.concatenate((X_train, X_test), axis=0) else: X_concat = X_train - LE = SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors_LE) - X_concat_embed = LE.fit_transform(X_concat) + X_concat_embed = LE_transform( + X=X_concat, + n_components=n_components, + n_neighbors=min(n_neighbors_LE, X_concat.shape[0]), + distance_metric="correlation", + ) X_train_embed = X_concat_embed[: X_train.shape[0], :] if X_test is not None: X_test_embed = X_concat_embed[X_train.shape[0] :, :] @@ -720,7 +901,7 @@ def 
task_presence_classification( embedding="LE", n_components=30, n_neighbors_LE=125, - LE_embedding_method="concat+embed", + LE_embedding_method="embed+procrustes", ) # task presence classification @@ -854,7 +1035,7 @@ def task_presence_clustering( embedding="LE", n_components=30, n_neighbors_LE=125, - LE_embedding_method="concat+embed", + LE_embedding_method="embed+procrustes", ) # clustering @@ -1107,7 +1288,7 @@ def task_paradigm_clustering( embedding="LE", n_components=30, n_neighbors_LE=125, - LE_embedding_method="concat+embed", + LE_embedding_method="embed+procrustes", ) # clustering From 84de66bc582102faebae066fd3148f015d822747 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 8 Aug 2024 12:03:48 -0400 Subject: [PATCH 103/401] remove outdated scripts --- simul_dFC/FCS_estimate.py | 146 ------------ simul_dFC/KNN_ML.py | 460 ------------------------------------ simul_dFC/dFC_assessment.py | 102 -------- 3 files changed, 708 deletions(-) delete mode 100644 simul_dFC/FCS_estimate.py delete mode 100644 simul_dFC/KNN_ML.py delete mode 100644 simul_dFC/dFC_assessment.py diff --git a/simul_dFC/FCS_estimate.py b/simul_dFC/FCS_estimate.py deleted file mode 100644 index 0fd7653..0000000 --- a/simul_dFC/FCS_estimate.py +++ /dev/null @@ -1,146 +0,0 @@ -import os -import time -import warnings - -import numpy as np - -from pydfc import MultiAnalysis, data_loader - -warnings.simplefilter("ignore") - -os.environ["MKL_NUM_THREADS"] = "16" -os.environ["NUMEXPR_NUM_THREADS"] = "16" -os.environ["OMP_NUM_THREADS"] = "16" - -################################# Parameters ################################# -# data paths -dataset = "ds000002" -# main_root = f"./DATA/{dataset}" # for local -main_root = f"/data/origami/dFC/DATA/task-based/simulated/{dataset}" # for server -roi_root = f"{main_root}/derivatives/ROI_timeseries" -output_root = f"{main_root}/derivatives/fitted_MEASURES" - -TASKS = [ - "task-midFreqMidRest", - "task-lowFreqLongRest", - "task-lowFreqShortRest", - 
"task-lowFreqShortTask", - "task-highFreqLongRest", - "task-highFreqShortRest", - "task-midFreqMidRestNoisy", -] - -job_id = int(os.getenv("SGE_TASK_ID")) -TASK_id = job_id - 1 # SGE_TASK_ID starts from 1 not 0 -if TASK_id >= len(TASKS): - print("TASK_id out of TASKS") - exit() -task = TASKS[TASK_id] - -###### MEASUREMENT PARAMETERS ###### - -# W is in sec - -params_methods = { - # Sliding Parameters - "W": 12, - "n_overlap": 1.0, - "sw_method": "pear_corr", - "tapered_window": True, - # TIME_FREQ - "TF_method": "WTC", - # CLUSTERING AND DHMM - "clstr_base_measure": "SlidingWindow", - # HMM - "hmm_iter": 20, - "dhmm_obs_state_ratio": 16 / 24, - # State Parameters - "n_states": 5, - "n_subj_clstrs": 10, - # Parallelization Parameters - "n_jobs": 2, - "verbose": 0, - "backend": "loky", - # SESSION - "session": task, - # Hyper Parameters - "normalization": True, - "num_subj": None, - "num_time_point": None, -} - -###### HYPER PARAMETERS ALTERNATIVE ###### - -MEASURES_name_lst = [ - "SlidingWindow", - "Time-Freq", - "CAP", - "ContinuousHMM", - "Windowless", - "Clustering", - "DiscreteHMM", -] - -alter_hparams = { - # 'session': ['Rest1_RL', 'Rest2_LR', 'Rest2_RL'], - # 'n_overlap': [0, 0.25, 0.75, 1], - # 'n_states': [6, 16], - # # 'normalization': [], - # 'num_subj': [50, 100, 200], - # 'num_select_nodes': [30, 50, 333], - # 'num_time_point': [800, 1000], - # 'Fs_ratio': [0.50, 0.75, 1.5], - # 'noise_ratio': [1.00, 2.00, 3.00], - # 'num_realization': [] -} - -###### MultiAnalysis PARAMETERS ###### - -params_multi_analysis = { - # Parallelization Parameters - "n_jobs": None, - "verbose": 0, - "backend": "loky", -} - -################################# LOAD DATA ################################# - -BOLD = data_loader.load_TS( - data_root=roi_root, - file_name="{subj_id}_{task}_time-series.npy", - SESSIONs=task, - subj_id2load=None, - task=task, -) -################################ Measures of dFC ################################# - -MA = MultiAnalysis( - 
analysis_name=f"simulated-task-based-dFC-{dataset}-{task}", **params_multi_analysis -) - -MEASURES_lst = MA.measures_initializer(MEASURES_name_lst, params_methods, alter_hparams) - -tic = time.time() -print("Measurement Started ...") - -################################# estimate FCS ################################# - -for MEASURE_id, measure in enumerate(MEASURES_lst): - - print("MEASURE: " + measure.measure_name) - print("FCS estimation started...") - - if measure.is_state_based: - measure.estimate_FCS(time_series=BOLD) - - print("FCS estimation done.") - - # Save - if not os.path.exists(f"{output_root}"): - os.makedirs(f"{output_root}") - np.save(f"{output_root}/MEASURE_{task}_{MEASURE_id}.npy", measure) - -print(f"Measurement required {time.time() - tic:0.3f} seconds.") -np.save(f"{output_root}/multi-analysis_{task}.npy", MA) - -################################################################################# diff --git a/simul_dFC/KNN_ML.py b/simul_dFC/KNN_ML.py deleted file mode 100644 index c1b60cc..0000000 --- a/simul_dFC/KNN_ML.py +++ /dev/null @@ -1,460 +0,0 @@ -import argparse -import json -import os - -import numpy as np -from sklearn.decomposition import PCA -from sklearn.metrics import balanced_accuracy_score -from sklearn.model_selection import GridSearchCV -from sklearn.neighbors import KNeighborsClassifier -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler - -from pydfc import DFC, data_loader, task_utils -from pydfc.dfc_utils import dFC_mat2vec, rank_norm - -####################################################################################### - - -def find_available_subjects(dFC_root, task, dFC_id=None): - """ - Find the subjects that have dFC results for the given task and dFC_id (method). 
- """ - SUBJECTS = list() - ALL_SUBJ_FOLDERS = os.listdir(f"{dFC_root}/") - ALL_SUBJ_FOLDERS = [folder for folder in ALL_SUBJ_FOLDERS if "sub-" in folder] - ALL_SUBJ_FOLDERS.sort() - for subj_folder in ALL_SUBJ_FOLDERS: - ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/") - ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if task in dFC_file] - if dFC_id is not None: - ALL_DFC_FILES = [ - dFC_file for dFC_file in ALL_DFC_FILES if f"_{dFC_id}.npy" in dFC_file - ] - ALL_DFC_FILES.sort() - if len(ALL_DFC_FILES) > 0: - SUBJECTS.append(subj_folder) - return SUBJECTS - - -def extract_task_features(TASKS, roi_root, output_root): - """ - Extract task features from the event data.""" - task_features = { - "task": list(), - "relative_task_on": list(), - "avg_task_duration": list(), - "var_task_duration": list(), - "avg_rest_duration": list(), - "var_rest_duration": list(), - "num_of_transitions": list(), - "relative_transition_freq": list(), - } - for task_id, task in enumerate(TASKS): - - if task == "task-restingstate": - continue - - SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task) - - for subj in SUBJECTS: - # event data - task_data = np.load( - f"{roi_root}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" - ).item() - Fs_task = task_data["Fs_task"] - TR_task = 1 / Fs_task - - task_presence = task_utils.extract_task_presence( - event_labels=task_data["event_labels"], - TR_task=TR_task, - TR_mri=task_data["TR_mri"], - binary=True, - ) - - relative_task_on = task_utils.relative_task_on(task_presence) - # task duration - avg_task_duration, var_task_duration = task_utils.task_duration( - task_presence, task_data["TR_mri"] - ) - # rest duration - avg_rest_duration, var_rest_duration = task_utils.rest_duration( - task_presence, task_data["TR_mri"] - ) - # freq of transitions - num_of_transitions, relative_transition_freq = task_utils.transition_freq( - task_presence - ) - - task_features["task"].append(task) - 
task_features["relative_task_on"].append(relative_task_on) - task_features["avg_task_duration"].append(avg_task_duration) - task_features["var_task_duration"].append(var_task_duration) - task_features["avg_rest_duration"].append(avg_rest_duration) - task_features["var_rest_duration"].append(var_rest_duration) - task_features["num_of_transitions"].append(num_of_transitions) - task_features["relative_transition_freq"].append(relative_transition_freq) - - folder = f"{output_root}" - if not os.path.exists(folder): - os.makedirs(folder) - np.save(f"{folder}/task_features_KNN_classify.npy", task_features) - - -def dFC_feature_extraction_subj_lvl( - dFC, - task_data, - dynamic_pred="no", - normalize_dFC=True, -): - """ - Extract features and target for task presence classification - for a single subject. - """ - # dFC features - dFC_mat = dFC.get_dFC_mat() - TR_array = dFC.TR_array - if normalize_dFC: - dFC_mat = rank_norm(dFC_mat) - dFC_vecs = dFC_mat2vec(dFC_mat) - - # event data - task_presence = task_utils.extract_task_presence( - event_labels=task_data["event_labels"], - TR_task=1 / task_data["Fs_task"], - TR_mri=task_data["TR_mri"], - TR_array=TR_array, - binary=True, - ) - - features = dFC_vecs - target = task_presence.ravel() - - if dynamic_pred == "past": - # concat current TR and two TR before of features to predict the current TR of target - # ignore the edge case of the first two TRs - features = np.concatenate( - (features, np.roll(features, 1, axis=0), np.roll(features, 2, axis=0)), axis=1 - ) - features = features[2:, :] - target = target[2:] - elif dynamic_pred == "past_and_future": - # concat current TR and two TR before and after of features to predict the current TR of target - # ignore the edge case of the first and last two TRs - features = np.concatenate( - ( - features, - np.roll(features, 1, axis=0), - np.roll(features, 2, axis=0), - np.roll(features, -1, axis=0), - np.roll(features, -2, axis=0), - ), - axis=1, - ) - features = features[2:-2, :] - 
target = target[2:-2] - - return features, target - - -def dFC_feature_extraction( - task, - train_subjects, - test_subjects, - dFC_id, - roi_root, - dFC_root, - dynamic_pred="no", - normalize_dFC=True, -): - """ - Extract features and target for task presence classification - for all subjects. - """ - X_train = None - y_train = None - subj_label_train = list() - for subj in train_subjects: - dFC = np.load( - f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" - ).item() - - task_data = np.load( - f"{roi_root}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" - ).item() - - X_subj, y_subj = dFC_feature_extraction_subj_lvl( - dFC=dFC, - task_data=task_data, - dynamic_pred=dynamic_pred, - normalize_dFC=normalize_dFC, - ) - - subj_label_train.extend([subj for i in range(X_subj.shape[0])]) - if X_train is None and y_train is None: - X_train = X_subj - y_train = y_subj - else: - X_train = np.concatenate((X_train, X_subj), axis=0) - y_train = np.concatenate((y_train, y_subj), axis=0) - - X_test = None - y_test = None - subj_label_test = list() - for subj in test_subjects: - dFC = np.load( - f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" - ).item() - - task_data = np.load( - f"{roi_root}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" - ).item() - - X_subj, y_subj = dFC_feature_extraction_subj_lvl( - dFC=dFC, - task_data=task_data, - dynamic_pred=dynamic_pred, - normalize_dFC=normalize_dFC, - ) - - subj_label_test.extend([subj for i in range(X_subj.shape[0])]) - if X_test is None and y_test is None: - X_test = X_subj - y_test = y_subj - else: - X_test = np.concatenate((X_test, X_subj), axis=0) - y_test = np.concatenate((y_test, y_subj), axis=0) - - print(X_train.shape, X_test.shape, y_train.shape, y_test.shape) - subj_label_train = np.array(subj_label_train) - subj_label_test = np.array(subj_label_test) - - return ( - X_train, - X_test, - y_train, - y_test, - subj_label_train, - subj_label_test, - dFC.measure.measure_name, - ) - - -def 
task_presence_classification( - task, - dFC_id, - roi_root, - dFC_root, - dynamic_pred="no", - normalize_dFC=True, - train_test_ratio=0.8, - explained_var_threshold=0.95, -): - print(f"=============== {task} ===============") - - if task == "task-restingstate": - return - - SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id) - - # randomly select train_test_ratio of the subjects for training - # and rest for testing using numpy.random.choice - train_subjects = np.random.choice( - SUBJECTS, int(train_test_ratio * len(SUBJECTS)), replace=False - ) - test_subjects = np.setdiff1d(SUBJECTS, train_subjects) - print( - f"Number of train subjects: {len(train_subjects)} and test subjects: {len(test_subjects)}" - ) - - X_train, X_test, y_train, y_test, subj_label_train, subj_label_test, measure_name = ( - dFC_feature_extraction( - task=task, - train_subjects=train_subjects, - test_subjects=test_subjects, - dFC_id=dFC_id, - roi_root=roi_root, - dFC_root=dFC_root, - dynamic_pred=dynamic_pred, - normalize_dFC=normalize_dFC, - ) - ) - - # task presence classification - - print("task presence classification ...") - - # find num_PCs - pca = PCA(svd_solver="full", whiten=False) - pca.fit(X_train) - num_PCs = ( - np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0] - + 1 - ) - - # create a pipeline with a knn model to find the best n_neighbors - knn = make_pipeline( - StandardScaler(), - PCA(n_components=num_PCs), - KNeighborsClassifier(), - ) - # create a dictionary of all values we want to test for n_neighbors - param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)} - # use gridsearch to test all values for n_neighbors - knn_gscv = GridSearchCV(knn, param_grid, cv=5) - # fit model to data - knn_gscv.fit(X_train, y_train) - - n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"] - - neigh = make_pipeline( - StandardScaler(), - PCA(n_components=num_PCs), - 
KNeighborsClassifier(n_neighbors=n_neighbors), - ).fit(X_train, y_train) - - ML_RESULT = { - "pca": pca, - "num_PCs": num_PCs, - "cv_results": knn_gscv.cv_results_, - "KNN": neigh, - "KNN train score": neigh.score(X_train, y_train), - "KNN test score": neigh.score(X_test, y_test), - } - - print(f"KNN train score {measure_name} {task}: {neigh.score(X_train, y_train)}") - print(f"KNN test score {measure_name} {task}: {neigh.score(X_test, y_test)}") - - # measure pred score on each subj - - ML_scores = { - "subj_id": list(), - "group": list(), - "task": list(), - "dFC method": list(), - "KNN accuracy": list(), - } - for subj in SUBJECTS: - ML_scores["subj_id"].append(subj) - if subj in train_subjects: - ML_scores["group"].append("train") - features = X_train[subj_label_train == subj, :] - target = y_train[subj_label_train == subj] - elif subj in test_subjects: - ML_scores["group"].append("test") - features = X_test[subj_label_test == subj, :] - target = y_test[subj_label_test == subj] - - pred = neigh.predict(features) - - ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred)) - - ML_scores["task"].append(task) - ML_scores["dFC method"].append(measure_name) - - return ML_RESULT, ML_scores - - -def run_classification( - TASKS, - roi_root, - dFC_root, - output_root, - dynamic_pred="no", - normalize_dFC=True, -): - ML_scores = { - "subj_id": list(), - "group": list(), - "task": list(), - "dFC method": list(), - "KNN accuracy": list(), - } - for dFC_id in range(0, 7): - print(f"=================== dFC {dFC_id} ===================") - - ML_RESULT = {} - for task_id, task in enumerate(TASKS): - ML_RESULT_new, ML_scores_new = task_presence_classification( - task=task, - dFC_id=dFC_id, - roi_root=roi_root, - dFC_root=dFC_root, - dynamic_pred=dynamic_pred, - normalize_dFC=normalize_dFC, - ) - ML_RESULT[task] = ML_RESULT_new - for key in ML_scores: - ML_scores[key].extend(ML_scores_new[key]) - - folder = f"{output_root}" - if not os.path.exists(folder): - 
os.makedirs(folder) - np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT) - - np.save(f"{folder}/ML_scores_KNN_classify.npy", ML_scores) - - -####################################################################################### - -if __name__ == "__main__": - # argparse - HELPTEXT = """ - Script to apply Machine Learning on dFC results to predict task presence. - """ - - parser = argparse.ArgumentParser(description=HELPTEXT) - - parser.add_argument("--dataset_info", type=str, help="path to dataset info file") - - args = parser.parse_args() - - dataset_info_file = args.dataset_info - - # Read global configs - with open(dataset_info_file, "r") as f: - dataset_info = json.load(f) - - print("Task presence prediction started ...") - - TASKS = dataset_info["TASKS"] - - if "{dataset}" in dataset_info["main_root"]: - main_root = dataset_info["main_root"].replace( - "{dataset}", dataset_info["dataset"] - ) - else: - main_root = dataset_info["main_root"] - - if "{main_root}" in dataset_info["roi_root"]: - roi_root = dataset_info["roi_root"].replace("{main_root}", main_root) - else: - roi_root = dataset_info["roi_root"] - - if "{main_root}" in dataset_info["dFC_root"]: - dFC_root = dataset_info["dFC_root"].replace("{main_root}", main_root) - else: - dFC_root = dataset_info["dFC_root"] - - if "{main_root}" in dataset_info["ML_root"]: - ML_root = dataset_info["ML_root"].replace("{main_root}", main_root) - else: - ML_root = dataset_info["ML_root"] - - extract_task_features( - TASKS=TASKS, - roi_root=roi_root, - output_root=ML_root, - ) - run_classification( - TASKS=TASKS, - roi_root=roi_root, - dFC_root=dFC_root, - output_root=ML_root, - dynamic_pred="no", - normalize_dFC=True, - ) - - print("Task presence prediction CODE finished running.") - -####################################################################################### diff --git a/simul_dFC/dFC_assessment.py b/simul_dFC/dFC_assessment.py deleted file mode 100644 index d140bd6..0000000 --- 
a/simul_dFC/dFC_assessment.py +++ /dev/null @@ -1,102 +0,0 @@ -import os -import time -import warnings - -import numpy as np - -from pydfc import MultiAnalysis, data_loader - -warnings.simplefilter("ignore") - -os.environ["MKL_NUM_THREADS"] = "16" -os.environ["NUMEXPR_NUM_THREADS"] = "16" -os.environ["OMP_NUM_THREADS"] = "16" - -################################# Parameters ################################# - -# Data parameters -dataset = "ds000001" -# main_root = f"./DATA/{dataset}" # for local -main_root = f"../../DATA/task-based/simulated/{dataset}" # for server - -# subjects used for dFC assessment do not need to be the same as those used for FCS_estimate -# you can set the new roi root and data load parameters here: -roi_root = f"{main_root}/derivatives/ROI_timeseries" -fitted_measures_root = f"{main_root}/derivatives/fitted_MEASURES" -output_root = f"{main_root}/derivatives/dFC_assessed" - -# for consistency we use 0 for resting state. will this cause a problem here?? -TASKS = ["task-pulse"] - -# find all subjects across all tasks -SUBJECTS = data_loader.find_subj_list(data_root=roi_root, sessions=TASKS) - -# job_id selects the subject -job_id = int(os.getenv("SGE_TASK_ID")) -if job_id > len(SUBJECTS): - print("job_id > len(SUBJECTS)") - exit() -subj_id = SUBJECTS[job_id - 1] # SGE_TASK_ID starts from 1 not 0 - -for task in TASKS: - - MA = np.load( - f"{fitted_measures_root}/{task}/multi_analysis.npy", allow_pickle="TRUE" - ).item() - - # check if the subject has this task - SUBJECTS_with_this_task = data_loader.find_subj_list( - data_root=roi_root, sessions=[task] - ) - if not subj_id in SUBJECTS_with_this_task: - print(f"subject {subj_id} not in the list of subjects with task {task}") - continue - - ################################# LOAD FIT MEASURES ################################# - - ALL_RECORDS = os.listdir(f"{fitted_measures_root}/{task}/") - ALL_RECORDS = [i for i in ALL_RECORDS if "MEASURE" in i] - ALL_RECORDS.sort() - MEASURES_fit_lst = list() - for 
s in ALL_RECORDS: - fit_measure = np.load( - f"{fitted_measures_root}/{task}/{s}", allow_pickle="TRUE" - ).item() - MEASURES_fit_lst.append(fit_measure) - MA.set_MEASURES_fit_lst(MEASURES_fit_lst) - print("fitted MEASURES loaded ...") - - ################################# LOAD DATA ################################# - - print( - f"subject-level dFC assessment CODE started running ... for task {task} of subject {subj_id} ..." - ) - - BOLD = data_loader.load_TS( - data_root=roi_root, - file_name="time_series.npy", - SESSIONs=[task], - subj_id2load=subj_id, - ) - - ################################# dFC ASSESSMENT ################################# - - tic = time.time() - print("Measurement Started ...") - - print("dFC estimation started...") - dFC_dict = MA.subj_lvl_dFC_assess(time_series_dict=BOLD) - print("dFC estimation done.") - - print(f"Measurement required {time.time() - tic:0.3f} seconds.") - - ################################# SAVE DATA ################################# - - folder = f"{output_root}/{task}/{subj_id}" - if not os.path.exists(folder): - os.makedirs(folder) - - for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]): - np.save(f"{folder}/dFC_{str(dFC_id)}.npy", dFC) - -####################################################################################### From b3061355373dfcd5d3f89192d93779f60c6bacd7 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 8 Aug 2024 12:06:33 -0400 Subject: [PATCH 104/401] add ml_utils --- pydfc/ml_utils.py | 1115 +++++++++++++++++++++++++++++++++++ pydfc/task_utils.py | 8 +- task_dFC/ML.py | 1096 +--------------------------------- task_dFC/generate_report.py | 8 +- 4 files changed, 1135 insertions(+), 1092 deletions(-) create mode 100644 pydfc/ml_utils.py diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py new file mode 100644 index 0000000..275753b --- /dev/null +++ b/pydfc/ml_utils.py @@ -0,0 +1,1115 @@ +# -*- coding: utf-8 -*- +""" +Functions to facilitate applying ML algorithms to dFC. 
+ +Created on Aug 8 2024 +@author: Mohammad Torabi +""" +import os + +import numpy as np +from scipy.spatial import procrustes +from sklearn.cluster import KMeans +from sklearn.decomposition import PCA +from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.manifold import SpectralEmbedding +from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score, silhouette_score +from sklearn.model_selection import GridSearchCV, RandomizedSearchCV +from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler + +from .dfc_utils import dFC_mat2vec, dFC_vec2mat, rank_norm +from .task_utils import ( + calc_relative_task_on, + calc_rest_duration, + calc_task_duration, + calc_transition_freq, + extract_task_presence, +) + +################################# Feature Loading Functions #################################### + + +def find_available_subjects(dFC_root, task, run=None, session=None, dFC_id=None): + """ + Find the subjects that have dFC results for the given task and dFC_id (method). 
+ """ + SUBJECTS = list() + ALL_SUBJ_FOLDERS = os.listdir(f"{dFC_root}/") + ALL_SUBJ_FOLDERS = [folder for folder in ALL_SUBJ_FOLDERS if "sub-" in folder] + ALL_SUBJ_FOLDERS.sort() + for subj_folder in ALL_SUBJ_FOLDERS: + if session is None: + ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/") + else: + ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/{session}/") + ALL_DFC_FILES = [ + dFC_file for dFC_file in ALL_DFC_FILES if f"_{task}_" in dFC_file + ] + if dFC_id is not None: + ALL_DFC_FILES = [ + dFC_file for dFC_file in ALL_DFC_FILES if f"_{dFC_id}.npy" in dFC_file + ] + if run is not None: + ALL_DFC_FILES = [ + dFC_file for dFC_file in ALL_DFC_FILES if f"_{run}_" in dFC_file + ] + if session is not None: + ALL_DFC_FILES = [ + dFC_file for dFC_file in ALL_DFC_FILES if f"_{session}_" in dFC_file + ] + ALL_DFC_FILES.sort() + if len(ALL_DFC_FILES) > 0: + SUBJECTS.append(subj_folder) + return SUBJECTS + + +def load_dFC(dFC_root, subj, task, dFC_id, run=None, session=None): + """ + Load the dFC results for a given subject, task, dFC_id, run and session. + """ + if session is None: + if run is None: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + else: + dFC = np.load( + f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE" + ).item() + else: + if run is None: + dFC = np.load( + f"{dFC_root}/{subj}/{session}/dFC_{session}_{task}_{dFC_id}.npy", + allow_pickle="TRUE", + ).item() + else: + dFC = np.load( + f"{dFC_root}/{subj}/{session}/dFC_{session}_{task}_{run}_{dFC_id}.npy", + allow_pickle="TRUE", + ).item() + + return dFC + + +def load_task_data(roi_root, subj, task, run=None, session=None): + """ + Load the task data for a given subject, task and run. 
+ """ + if session is None: + if run is None: + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" + ).item() + else: + task_data = np.load( + f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy", + allow_pickle="TRUE", + ).item() + else: + if run is None: + task_data = np.load( + f"{roi_root}/{subj}/{session}/{subj}_{session}_{task}_task-data.npy", + allow_pickle="TRUE", + ).item() + else: + task_data = np.load( + f"{roi_root}/{subj}/{session}/{subj}_{session}_{task}_{run}_task-data.npy", + allow_pickle="TRUE", + ).item() + + return task_data + + +################################# Feature Extraction Functions #################################### + + +def extract_task_features(TASKS, RUNS, SESSIONS, roi_root, dFC_root, output_root): + """ + Extract task features from the event data.""" + for session in SESSIONS: + task_features = { + "task": list(), + "run": list(), + "relative_task_on": list(), + "avg_task_duration": list(), + "var_task_duration": list(), + "avg_rest_duration": list(), + "var_rest_duration": list(), + "num_of_transitions": list(), + "relative_transition_freq": list(), + } + for task_id, task in enumerate(TASKS): + + if task == "task-restingstate": + continue + + for run in RUNS[task]: + + SUBJECTS = find_available_subjects( + dFC_root=dFC_root, task=task, run=run, session=session + ) + + for subj in SUBJECTS: + # event data + task_data = load_task_data( + roi_root=roi_root, subj=subj, task=task, run=run, session=session + ) + Fs_task = task_data["Fs_task"] + TR_task = 1 / Fs_task + + task_presence = extract_task_presence( + event_labels=task_data["event_labels"], + TR_task=TR_task, + TR_mri=task_data["TR_mri"], + binary=True, + binarizing_method="mean", + ) + + relative_task_on = calc_relative_task_on(task_presence) + # task duration + avg_task_duration, var_task_duration = calc_task_duration( + task_presence, task_data["TR_mri"] + ) + # rest duration + avg_rest_duration, var_rest_duration = 
calc_rest_duration( + task_presence, task_data["TR_mri"] + ) + # freq of transitions + num_of_transitions, relative_transition_freq = calc_transition_freq( + task_presence + ) + + task_features["task"].append(task) + task_features["run"].append(run) + task_features["relative_task_on"].append(relative_task_on) + task_features["avg_task_duration"].append(avg_task_duration) + task_features["var_task_duration"].append(var_task_duration) + task_features["avg_rest_duration"].append(avg_rest_duration) + task_features["var_rest_duration"].append(var_rest_duration) + task_features["num_of_transitions"].append(num_of_transitions) + task_features["relative_transition_freq"].append( + relative_transition_freq + ) + + if session is None: + folder = f"{output_root}" + else: + folder = f"{output_root}/{session}" + if not os.path.exists(folder): + os.makedirs(folder) + np.save(f"{folder}/task_features.npy", task_features) + + +def dFC_feature_extraction_subj_lvl( + dFC, + task_data, + dynamic_pred="no", + normalize_dFC=True, +): + """ + Extract features and target for task presence classification + for a single subject. 
+ dynamic_pred: "no", "past", "past_and_future" + """ + # dFC features + dFC_mat = dFC.get_dFC_mat() + TR_array = dFC.TR_array + if normalize_dFC: + dFC_mat = rank_norm(dFC_mat) + dFC_vecs = dFC_mat2vec(dFC_mat) + + # event data + task_presence = extract_task_presence( + event_labels=task_data["event_labels"], + TR_task=1 / task_data["Fs_task"], + TR_mri=task_data["TR_mri"], + TR_array=TR_array, + binary=True, + binarizing_method="mean", + ) + + features = dFC_vecs + target = task_presence.ravel() + + if dynamic_pred == "past": + # concat current TR and two TR before of features to predict the current TR of target + # ignore the edge case of the first two TRs + features = np.concatenate( + (features, np.roll(features, 1, axis=0), np.roll(features, 2, axis=0)), axis=1 + ) + features = features[2:, :] + target = target[2:] + elif dynamic_pred == "past_and_future": + # concat current TR and two TR before and after of features to predict the current TR of target + # ignore the edge case of the first and last two TRs + features = np.concatenate( + ( + features, + np.roll(features, 1, axis=0), + np.roll(features, 2, axis=0), + np.roll(features, -1, axis=0), + np.roll(features, -2, axis=0), + ), + axis=1, + ) + features = features[2:-2, :] + target = target[2:-2] + + return features, target + + +def dFC_feature_extraction( + task, + train_subjects, + test_subjects, + dFC_id, + roi_root, + dFC_root, + run=None, + session=None, + dynamic_pred="no", + normalize_dFC=True, +): + """ + Extract features and target for task presence classification + for all subjects. + if run is specified, dFC results for that run will be used. 
+ """ + dFC_measure_name = None + X_train = None + y_train = None + subj_label_train = list() + for subj in train_subjects: + + dFC = load_dFC( + dFC_root=dFC_root, + subj=subj, + task=task, + dFC_id=dFC_id, + run=run, + session=session, + ) + task_data = load_task_data( + roi_root=roi_root, subj=subj, task=task, run=run, session=session + ) + + X_subj, y_subj = dFC_feature_extraction_subj_lvl( + dFC=dFC, + task_data=task_data, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, + ) + + subj_label_train.extend([subj for i in range(X_subj.shape[0])]) + if X_train is None and y_train is None: + X_train = X_subj + y_train = y_subj + else: + X_train = np.concatenate((X_train, X_subj), axis=0) + y_train = np.concatenate((y_train, y_subj), axis=0) + + if dFC_measure_name is None: + dFC_measure_name = dFC.measure.measure_name + else: + assert ( + dFC_measure_name == dFC.measure.measure_name + ), "dFC measure is not consistent." + + X_test = None + y_test = None + subj_label_test = list() + for subj in test_subjects: + dFC = load_dFC( + dFC_root=dFC_root, + subj=subj, + task=task, + dFC_id=dFC_id, + run=run, + session=session, + ) + task_data = load_task_data( + roi_root=roi_root, subj=subj, task=task, run=run, session=session + ) + + X_subj, y_subj = dFC_feature_extraction_subj_lvl( + dFC=dFC, + task_data=task_data, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, + ) + + subj_label_test.extend([subj for i in range(X_subj.shape[0])]) + if X_test is None and y_test is None: + X_test = X_subj + y_test = y_subj + else: + X_test = np.concatenate((X_test, X_subj), axis=0) + y_test = np.concatenate((y_test, y_subj), axis=0) + + if dFC_measure_name is None: + dFC_measure_name = dFC.measure.measure_name + else: + assert ( + dFC_measure_name == dFC.measure.measure_name + ), "dFC measure is not consistent." 
+ + # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape) + subj_label_train = np.array(subj_label_train) + subj_label_test = np.array(subj_label_test) + + return ( + X_train, + X_test, + y_train, + y_test, + subj_label_train, + subj_label_test, + dFC_measure_name, + ) + + +################################# Feature Embedding Functions #################################### + + +def precheck_for_procruste(X_best, X_subj): + """ + Check if the two matrices have the same number of rows. if not, make them the same. + """ + # for the procrustes transformation, the number of samples should be the same + if X_subj.shape[0] > X_best.shape[0]: + # add zero rows to the embedding of the best subject + X_best_new = np.concatenate( + ( + X_best, + np.zeros( + ( + X_subj.shape[0] - X_best.shape[0], + X_best.shape[1], + ) + ), + ), + axis=0, + ) + elif X_subj.shape[0] < X_best.shape[0]: + # remove extra rows from the embedding of the best subject + X_best_new = X_best[: X_subj.shape[0], :] + else: + X_best_new = X_best + + X_best_new = X_best_new.copy() + + return X_best_new + + +def generalized_procrustes(X_list): + """ + Generalized Procrustes Analysis + + returns the mean X to be used as the reference for procrustes transformation + """ + # initialize Procrustes distance + current_distance = 0 + + # initialize a mean X + mean_X = np.array(X_list[0]) + + num_X = len(X_list) + + # create array for new Xs, add + new_Xs = np.zeros(np.array(X_list).shape) + + while True: + # add the mean X as first element of array + new_Xs[0] = mean_X + + # superimpose all shapes to current mean + for i in range(1, num_X): + _, new_X, _ = procrustes(mean_X, X_list[i]) + new_Xs[i] = new_X + + # calculate new mean + new_mean = np.mean(new_Xs, axis=0) + + _, _, new_distance = procrustes(new_mean, mean_X) + + # if the distance did not change, break the cycle + if np.abs(new_distance - current_distance) < 1e-6: + break + + # align the new_mean to old mean + _, new_mean, _ = procrustes(mean_X, 
new_mean) + + # update mean and distance + mean_X = new_mean + current_distance = new_distance + + return mean_X + + +def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"): + """ + Apply Laplacian Eigenmaps (LE) to transform data into a lower dimensional space. + """ + affinity_matrix = kneighbors_graph( + X, + n_neighbors=n_neighbors, + mode="connectivity", + include_self=False, + metric=distance_metric, + ) + affinity_matrix = affinity_matrix.toarray() + affinity_matrix = np.divide(affinity_matrix + affinity_matrix.T, 2) + LE = SpectralEmbedding( + n_components=n_components, affinity="precomputed", n_neighbors=n_neighbors + ) + X_embed = LE.fit_transform(X=affinity_matrix) + return X_embed + + +def LE_embed_procustes( + X_train, + X_test, + y_train, + y_test, + subj_label_train, + subj_label_test, + train_subjects, + test_subjects, + n_components=30, + n_neighbors_LE=125, + procruste_method="best_SI", +): + if procruste_method == "best_SI": + # first embed the dFC features of each subject into a lower dimensional space using LE separately + embed_dict = {} + for subject in train_subjects: + # assert the samples of the same subject are contiguous + assert np.all( + np.diff(np.where(subj_label_train == subject)[0]) == 1 + ), f"Indices of {subject} are not consecutive" + X_subj = X_train[subj_label_train == subject, :] + y_subj = y_train[subj_label_train == subject] + X_subj_embed = LE_transform( + X=X_subj, + n_components=n_components, + n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + distance_metric="correlation", + ) + SI = silhouette_score(X_subj_embed, y_subj) + embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI} + + # find the best transformation based on the SI score + best_SI = -1 + best_subject = None + for subject in embed_dict: + if embed_dict[subject]["SI"] > best_SI: + best_SI = embed_dict[subject]["SI"] + best_subject = subject + + # apply procrustes transformation to align the embeddings of different subjects + # 
use the embeddings of the subject with the highest SI score as the reference + X_train_embed = None + for subject in train_subjects: + X_subj_embed = embed_dict[subject]["X_subj_embed"] + # procrustes transformation + if subject == best_subject: + X_subj_embed_transformed = X_subj_embed + else: + # for the procrustes transformation, the number of samples should be the same + X_best_subj_embed = precheck_for_procruste( + embed_dict[best_subject]["X_subj_embed"], X_subj_embed + ) + _, X_subj_embed_transformed, _ = procrustes( + X_best_subj_embed, X_subj_embed + ) + if X_train_embed is None: + X_train_embed = X_subj_embed_transformed + else: + X_train_embed = np.concatenate( + (X_train_embed, X_subj_embed_transformed), axis=0 + ) + + # apply the same transformation to the test set + X_test_embed = None + for subject in test_subjects: + # assert the samples of the same subject are contiguous + assert np.all( + np.diff(np.where(subj_label_test == subject)[0]) == 1 + ), f"Indices of {subject} are not consecutive" + X_subj = X_test[subj_label_test == subject, :] + X_subj_embed = LE_transform( + X=X_subj, + n_components=n_components, + n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + distance_metric="correlation", + ) + # procrustes transformation + # for the procrustes transformation, the number of samples should be the same + X_best_subj_embed = precheck_for_procruste( + embed_dict[best_subject]["X_subj_embed"], X_subj_embed + ) + _, X_subj_embed_transformed, _ = procrustes(X_best_subj_embed, X_subj_embed) + if X_test_embed is None: + X_test_embed = X_subj_embed_transformed + else: + X_test_embed = np.concatenate( + (X_test_embed, X_subj_embed_transformed), axis=0 + ) + + elif procruste_method == "generalized": + # in this method we use generalized procrustes analysis to align the embeddings of different subjects + # first embed the dFC features of each subject into a lower dimensional space using LE separately + embed_dict = {} + for subject in train_subjects: + # 
assert the samples of the same subject are contiguous + assert np.all( + np.diff(np.where(subj_label_train == subject)[0]) == 1 + ), f"Indices of {subject} are not consecutive" + X_subj = X_train[subj_label_train == subject, :] + X_subj_embed = LE_transform( + X=X_subj, + n_components=n_components, + n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + distance_metric="correlation", + ) + embed_dict[subject] = X_subj_embed + + # then find the max number of samples among all subjects + max_samples = 0 + for subject in train_subjects: + if embed_dict[subject].shape[0] > max_samples: + max_samples = embed_dict[subject].shape[0] + + # find the mean embedding of all subjects to use as the reference for procrustes transformation + X_train_list = [] + for subject in train_subjects: + X_subj_embed = embed_dict[subject] + # add zero rows to the embedding of the subject with less samples + if X_subj_embed.shape[0] < max_samples: + X_subj_embed_new = np.concatenate( + ( + X_subj_embed, + np.zeros( + ( + max_samples - X_subj_embed.shape[0], + X_subj_embed.shape[1], + ) + ), + ), + axis=0, + ) + else: + X_subj_embed_new = X_subj_embed + X_train_list.append(X_subj_embed_new) + mean_X_train = generalized_procrustes(X_train_list) + + X_train_embed = None + for subject in train_subjects: + X_subj_embed = embed_dict[subject] + mean_X_train_new_size = precheck_for_procruste(mean_X_train, X_subj_embed) + _, X_subj_embed_transformed, _ = procrustes( + mean_X_train_new_size, X_subj_embed + ) + if X_train_embed is None: + X_train_embed = X_subj_embed_transformed + else: + X_train_embed = np.concatenate( + (X_train_embed, X_subj_embed_transformed), axis=0 + ) + + X_test_embed = None + for subject in test_subjects: + X_subj = X_test[subj_label_test == subject, :] + X_subj_embed = LE_transform( + X=X_subj, + n_components=n_components, + n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + distance_metric="correlation", + ) + mean_X_train_new_size = precheck_for_procruste(mean_X_train, 
X_subj_embed) + _, X_subj_embed_transformed, _ = procrustes( + mean_X_train_new_size, X_subj_embed + ) + if X_test_embed is None: + X_test_embed = X_subj_embed_transformed + else: + X_test_embed = np.concatenate( + (X_test_embed, X_subj_embed_transformed), axis=0 + ) + + return X_train_embed, X_test_embed + + +def embed_dFC_features( + train_subjects, + test_subjects, + X_train, + X_test, + y_train, + y_test, + subj_label_train, + subj_label_test, + embedding="PCA", + n_components=30, + n_neighbors_LE=125, + LE_embedding_method="concat+embed", +): + """ + Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous. + + for LE, first the LE is applied on each subj separately and then the procrustes transformation is applied to align the embeddings of different subjects. + All the subjects are transformed into the space of the subject with the highest silhouette score. + + LE_embedding_method: "concat+embed" or "embed+procrustes" + """ + if embedding == "PCA": + pca = PCA(n_components=n_components, svd_solver="full", whiten=False) + pca.fit(X_train) + X_train_embed = pca.transform(X_train) + if X_test is not None: + X_test_embed = pca.transform(X_test) + else: + X_test_embed = None + elif embedding == "LE": + if LE_embedding_method == "embed+procrustes": + X_train_embed, X_test_embed = LE_embed_procustes( + X_train=X_train, + X_test=X_test, + y_train=y_train, + y_test=y_test, + subj_label_train=subj_label_train, + subj_label_test=subj_label_test, + train_subjects=train_subjects, + test_subjects=test_subjects, + n_components=n_components, + n_neighbors_LE=n_neighbors_LE, + procruste_method="generalized", + ) + elif LE_embedding_method == "concat+embed": + # since SpectralEmbedding does not have transform method, we need to fit the LE on the whole data + if X_test is not None: + X_concat = np.concatenate((X_train, X_test), axis=0) + else: + X_concat = X_train + X_concat_embed = 
LE_transform( + X=X_concat, + n_components=n_components, + n_neighbors=min(n_neighbors_LE, X_concat.shape[0]), + distance_metric="correlation", + ) + X_train_embed = X_concat_embed[: X_train.shape[0], :] + if X_test is not None: + X_test_embed = X_concat_embed[X_train.shape[0] :, :] + else: + X_test_embed = None + + return X_train_embed, X_test_embed + + +################################# Classification Framework Functions #################################### + + +def logistic_regression_classify(X_train, y_train, X_test, y_test): + """ + Logistic regression classification + """ + # create a pipeline with a logistic regression model to find the best C + logistic_reg = make_pipeline( + StandardScaler(), LogisticRegression(penalty="l1", solver="saga") + ) + # create a dictionary of all values we want to test for C + param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]} + # use gridsearch to test all values for C + lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=5) + # fit model to data + lr_gscv.fit(X_train, y_train) + + C = lr_gscv.best_params_["logisticregression__C"] + + log_reg = make_pipeline( + StandardScaler(), + LogisticRegression(penalty="l1", C=C, solver="saga"), + ).fit(X_train, y_train) + + RESULT = { + "log_reg_model": log_reg, + "log_reg_C": C, + "log_reg_train_score": log_reg.score(X_train, y_train), + "log_reg_test_score": log_reg.score(X_test, y_test), + } + + return RESULT + + +def KNN_classify(X_train, y_train, X_test, y_test): + """ + KNN classification + """ + # create a pipeline with a knn model to find the best n_neighbors + knn = make_pipeline( + StandardScaler(), + KNeighborsClassifier(), + ) + # create a dictionary of all values we want to test for n_neighbors + param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)} + # use gridsearch to test all values for n_neighbors + knn_gscv = GridSearchCV(knn, param_grid, cv=5) + # fit model to data + knn_gscv.fit(X_train, y_train) + + n_neighbors = 
knn_gscv.best_params_["kneighborsclassifier__n_neighbors"] + + neigh = make_pipeline( + StandardScaler(), + KNeighborsClassifier(n_neighbors=n_neighbors), + ).fit(X_train, y_train) + + RESULT = { + "KNN_cv_results": knn_gscv.cv_results_, + "KNN_model": neigh, + "KNN_train_score": neigh.score(X_train, y_train), + "KNN_test_score": neigh.score(X_test, y_test), + } + + return RESULT + + +def random_forest_classify(X_train, y_train, X_test, y_test): + """ + Random Forest classification + """ + # create a pipeline with a random forest model to find the best n_estimators + rf = make_pipeline( + StandardScaler(), + RandomForestClassifier(), + ) + # create a dictionary of all values we want to test for n_estimators + param_grid = { + "randomforestclassifier__n_estimators": [10, 50, 100, 200], + "randomforestclassifier__max_depth": [None, 5, 10, 20, 30], + } + # use gridsearch to test all values for n_estimators + rf_gscv = GridSearchCV(rf, param_grid, cv=5) + # fit model to data + rf_gscv.fit(X_train, y_train) + + n_estimators = rf_gscv.best_params_["randomforestclassifier__n_estimators"] + max_depth = rf_gscv.best_params_["randomforestclassifier__max_depth"] + + rf = make_pipeline( + StandardScaler(), + RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth), + ).fit(X_train, y_train) + + RESULT = { + "RF_cv_results": rf_gscv.cv_results_, + "RF_model": rf, + "RF_train_score": rf.score(X_train, y_train), + "RF_test_score": rf.score(X_test, y_test), + } + + return RESULT + + +def gradient_boosting_classify(X_train, y_train, X_test, y_test): + """ + Gradient Boosting classification + """ + # create a pipeline with a gradient boosting model to find the best n_estimators + gb = make_pipeline( + StandardScaler(), + GradientBoostingClassifier(), + ) + # create a dictionary of all values we want to test for n_estimators + param_grid = { + "gradientboostingclassifier__n_estimators": [10, 50, 100, 200], + "gradientboostingclassifier__learning_rate": [0.01, 0.1, 0.2], 
+ "gradientboostingclassifier__max_depth": [3, 5, 10], + } + # use gridsearch to test all values for n_estimators + gb_gscv = GridSearchCV(gb, param_grid, cv=5) + # fit model to data + gb_gscv.fit(X_train, y_train) + + n_estimators = gb_gscv.best_params_["gradientboostingclassifier__n_estimators"] + learning_rate = gb_gscv.best_params_["gradientboostingclassifier__learning_rate"] + max_depth = gb_gscv.best_params_["gradientboostingclassifier__max_depth"] + + gb = make_pipeline( + StandardScaler(), + GradientBoostingClassifier( + n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate + ), + ).fit(X_train, y_train) + + RESULT = { + "GB_cv_results": gb_gscv.cv_results_, + "GB_model": gb, + "GB_train_score": gb.score(X_train, y_train), + "GB_test_score": gb.score(X_test, y_test), + } + + return RESULT + + +def task_presence_classification( + task, + dFC_id, + roi_root, + dFC_root, + run=None, + session=None, + dynamic_pred="no", + normalize_dFC=True, + train_test_ratio=0.8, +): + """ + perform task presence classification using logistic regression, KNN, Random Forest, Gradient Boosting + for a given task and dFC method and run. 
+ """ + if run is None: + print(f"=============== {task} ===============") + else: + print(f"=============== {task} {run} ===============") + + if task == "task-restingstate": + return + + SUBJECTS = find_available_subjects( + dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id + ) + + # randomly select train_test_ratio of the subjects for training + # and rest for testing using numpy.random.choice + train_subjects = np.random.choice( + SUBJECTS, int(train_test_ratio * len(SUBJECTS)), replace=False + ) + test_subjects = np.setdiff1d(SUBJECTS, train_subjects) + print( + f"Number of train subjects: {len(train_subjects)} and test subjects: {len(test_subjects)}" + ) + + X_train, X_test, y_train, y_test, subj_label_train, subj_label_test, measure_name = ( + dFC_feature_extraction( + task=task, + train_subjects=train_subjects, + test_subjects=test_subjects, + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + run=run, + session=session, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, + ) + ) + + # embed dFC features + X_train, X_test = embed_dFC_features( + train_subjects=train_subjects, + test_subjects=test_subjects, + X_train=X_train, + X_test=X_test, + y_train=y_train, + y_test=y_test, + subj_label_train=subj_label_train, + subj_label_test=subj_label_test, + embedding="LE", + n_components=30, + n_neighbors_LE=125, + LE_embedding_method="embed+procrustes", + ) + + # task presence classification + + print("task presence classification ...") + + # logistic regression + log_reg_RESULT = logistic_regression_classify(X_train, y_train, X_test, y_test) + + # KNN + KNN_RESULT = KNN_classify(X_train, y_train, X_test, y_test) + + # # Random Forest + # RF_RESULT = random_forest_classify( + # X_train, y_train, X_test, y_test + # ) + + # # Gradient Boosting + # GBT_RESULT = gradient_boosting_classify( + # X_train, y_train, X_test, y_test + # ) + + ML_RESULT = {} + for key in log_reg_RESULT: + ML_RESULT[key] = log_reg_RESULT[key] + for key in 
KNN_RESULT: + ML_RESULT[key] = KNN_RESULT[key] + # for key in RF_RESULT: + # ML_RESULT[key] = RF_RESULT[key] + # for key in GBT_RESULT: + # ML_RESULT[key] = GBT_RESULT[key] + + # measure pred score on each subj + + ML_scores = { + "subj_id": list(), + "group": list(), + "task": list(), + "run": list(), + "dFC method": list(), + "Logistic regression accuracy": list(), + "KNN accuracy": list(), + # "Random Forest accuracy": list(), + # "Gradient Boosting accuracy": list(), + } + log_reg = log_reg_RESULT["log_reg_model"] + KNN = KNN_RESULT["KNN_model"] + # RF = RF_RESULT["RF_model"] + # GBT = GBT_RESULT["GB_model"] + + for subj in SUBJECTS: + ML_scores["subj_id"].append(subj) + if subj in train_subjects: + ML_scores["group"].append("train") + features = X_train[subj_label_train == subj, :] + target = y_train[subj_label_train == subj] + elif subj in test_subjects: + ML_scores["group"].append("test") + features = X_test[subj_label_test == subj, :] + target = y_test[subj_label_test == subj] + + pred_lr = log_reg.predict(features) + pred_KNN = KNN.predict(features) + # pred_RF = RF.predict(features) + # pred_GBT = GBT.predict(features) + + ML_scores["Logistic regression accuracy"].append( + balanced_accuracy_score(target, pred_lr) + ) + ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN)) + # ML_scores["Random Forest accuracy"].append( + # balanced_accuracy_score(target, pred_RF) + # ) + # ML_scores["Gradient Boosting accuracy"].append( + # balanced_accuracy_score(target, pred_GBT) + # ) + + ML_scores["task"].append(task) + ML_scores["run"].append(run) + ML_scores["dFC method"].append(measure_name) + + return ML_RESULT, ML_scores + + +################################# Clustering Framework Functions #################################### + + +def task_presence_clustering( + task, + dFC_id, + roi_root, + dFC_root, + run=None, + session=None, + normalize_dFC=True, +): + if run is None: + print(f"=============== {task} ===============") + else: + 
print(f"=============== {task} {run} ===============") + + if task == "task-restingstate": + return + + SUBJECTS = find_available_subjects( + dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id + ) + + print(f"Number of subjects: {len(SUBJECTS)}") + + X, _, y, _, subj_label, _, measure_name = dFC_feature_extraction( + task=task, + train_subjects=SUBJECTS, + test_subjects=[], + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + run=run, + session=session, + dynamic_pred="no", + normalize_dFC=normalize_dFC, + ) + + # embed dFC features + X, _ = embed_dFC_features( + train_subjects=SUBJECTS, + test_subjects=[], + X_train=X, + X_test=None, + y_train=y, + y_test=None, + subj_label_train=subj_label, + subj_label_test=None, + embedding="LE", + n_components=30, + n_neighbors_LE=125, + LE_embedding_method="embed+procrustes", + ) + + # clustering + # apply kmeans clustering to dFC features + + n_clusters = 2 # corresponding to task and rest + + scaler = StandardScaler() + X_normalized = scaler.fit_transform(X) + kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5) + labels_pred = kmeans.fit_predict(X_normalized) + + # ARI score + print(f"ARI score: {adjusted_rand_score(y, labels_pred)}") + + # # visualize clustering centroids + # centroids = kmeans.cluster_centers_ + # centroids = pca.inverse_transform(centroids) + # centroids = scaler.inverse_transform(centroids) + # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) + # centroids_mat = dFC_vec2mat(centroids, n_regions) + + clustering_RESULTS = { + "StandardScaler": scaler, + "kmeans": kmeans, + "ARI": adjusted_rand_score(y, labels_pred), + # "centroids": centroids_mat, + } + + clustering_scores = { + "subj_id": list(), + "task": list(), + "run": list(), + "dFC method": list(), + "Kmeans ARI": list(), + "SI": list(), + } + for subj in SUBJECTS: + clustering_scores["subj_id"].append(subj) + features = X[subj_label == subj, :] + target = y[subj_label == subj] + + 
features_normalized = scaler.transform(features) + pred_kmeans = kmeans.predict(features_normalized) + + clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_kmeans)) + + # silhouette score in terms of separability of original labels, not the clustering labels + clustering_scores["SI"].append(silhouette_score(features, target)) + + clustering_scores["task"].append(task) + clustering_scores["run"].append(run) + clustering_scores["dFC method"].append(measure_name) + + return clustering_RESULTS, clustering_scores diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py index 4dedc52..8a11cbf 100644 --- a/pydfc/task_utils.py +++ b/pydfc/task_utils.py @@ -314,7 +314,7 @@ def extract_task_presence( ################################# Task Features #################################### -def relative_task_on(task_presence): +def calc_relative_task_on(task_presence): """ task_presence: 0, 1 array return: relative_task_on @@ -322,7 +322,7 @@ def relative_task_on(task_presence): return np.sum(task_presence) / len(task_presence) -def task_duration(task_presence, TR_mri): +def calc_task_duration(task_presence, TR_mri): """ task_presence: 0, 1 array return: avg_task_duration, var_task_duration @@ -339,7 +339,7 @@ def task_duration(task_presence, TR_mri): return np.mean(task_durations), np.var(task_durations) -def rest_duration(task_presence, TR_mri): +def calc_rest_duration(task_presence, TR_mri): """ task_presence: 0, 1 array return: avg_rest_duration, var_rest_duration @@ -361,7 +361,7 @@ def rest_duration(task_presence, TR_mri): return np.mean(rest_durations), np.var(rest_durations) -def transition_freq(task_presence): +def calc_transition_freq(task_presence): """ task_presence: 0, 1 array return: num_of_transitions, relative_transition_freq diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 8c16cca..1fb7bd1 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -4,1095 +4,22 @@ import traceback import numpy as np -from scipy.spatial import procrustes from 
sklearn.cluster import KMeans -from sklearn.decomposition import PCA -from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier -from sklearn.linear_model import LogisticRegression -from sklearn.manifold import SpectralEmbedding -from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score, silhouette_score -from sklearn.model_selection import GridSearchCV, RandomizedSearchCV -from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph -from sklearn.pipeline import make_pipeline +from sklearn.metrics import adjusted_rand_score, silhouette_score from sklearn.preprocessing import StandardScaler -from pydfc import DFC, data_loader, task_utils -from pydfc.dfc_utils import dFC_mat2vec, dFC_vec2mat, rank_norm +from pydfc.ml_utils import ( + dFC_feature_extraction, + embed_dFC_features, + extract_task_features, + find_available_subjects, + task_presence_classification, + task_presence_clustering, +) ####################################################################################### -def find_available_subjects(dFC_root, task, run=None, session=None, dFC_id=None): - """ - Find the subjects that have dFC results for the given task and dFC_id (method). 
- """ - SUBJECTS = list() - ALL_SUBJ_FOLDERS = os.listdir(f"{dFC_root}/") - ALL_SUBJ_FOLDERS = [folder for folder in ALL_SUBJ_FOLDERS if "sub-" in folder] - ALL_SUBJ_FOLDERS.sort() - for subj_folder in ALL_SUBJ_FOLDERS: - if session is None: - ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/") - else: - ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/{session}/") - ALL_DFC_FILES = [ - dFC_file for dFC_file in ALL_DFC_FILES if f"_{task}_" in dFC_file - ] - if dFC_id is not None: - ALL_DFC_FILES = [ - dFC_file for dFC_file in ALL_DFC_FILES if f"_{dFC_id}.npy" in dFC_file - ] - if run is not None: - ALL_DFC_FILES = [ - dFC_file for dFC_file in ALL_DFC_FILES if f"_{run}_" in dFC_file - ] - if session is not None: - ALL_DFC_FILES = [ - dFC_file for dFC_file in ALL_DFC_FILES if f"_{session}_" in dFC_file - ] - ALL_DFC_FILES.sort() - if len(ALL_DFC_FILES) > 0: - SUBJECTS.append(subj_folder) - return SUBJECTS - - -def extract_task_features(TASKS, RUNS, SESSIONS, roi_root, output_root): - """ - Extract task features from the event data.""" - for session in SESSIONS: - task_features = { - "task": list(), - "run": list(), - "relative_task_on": list(), - "avg_task_duration": list(), - "var_task_duration": list(), - "avg_rest_duration": list(), - "var_rest_duration": list(), - "num_of_transitions": list(), - "relative_transition_freq": list(), - } - for task_id, task in enumerate(TASKS): - - if task == "task-restingstate": - continue - - for run in RUNS[task]: - - SUBJECTS = find_available_subjects( - dFC_root=dFC_root, task=task, run=run, session=session - ) - - for subj in SUBJECTS: - # event data - task_data = load_task_data( - roi_root=roi_root, subj=subj, task=task, run=run, session=session - ) - Fs_task = task_data["Fs_task"] - TR_task = 1 / Fs_task - - task_presence = task_utils.extract_task_presence( - event_labels=task_data["event_labels"], - TR_task=TR_task, - TR_mri=task_data["TR_mri"], - binary=True, - binarizing_method="mean", - ) - - relative_task_on = 
task_utils.relative_task_on(task_presence) - # task duration - avg_task_duration, var_task_duration = task_utils.task_duration( - task_presence, task_data["TR_mri"] - ) - # rest duration - avg_rest_duration, var_rest_duration = task_utils.rest_duration( - task_presence, task_data["TR_mri"] - ) - # freq of transitions - num_of_transitions, relative_transition_freq = ( - task_utils.transition_freq(task_presence) - ) - - task_features["task"].append(task) - task_features["run"].append(run) - task_features["relative_task_on"].append(relative_task_on) - task_features["avg_task_duration"].append(avg_task_duration) - task_features["var_task_duration"].append(var_task_duration) - task_features["avg_rest_duration"].append(avg_rest_duration) - task_features["var_rest_duration"].append(var_rest_duration) - task_features["num_of_transitions"].append(num_of_transitions) - task_features["relative_transition_freq"].append( - relative_transition_freq - ) - - if session is None: - folder = f"{output_root}" - else: - folder = f"{output_root}/{session}" - if not os.path.exists(folder): - os.makedirs(folder) - np.save(f"{folder}/task_features.npy", task_features) - - -def dFC_feature_extraction_subj_lvl( - dFC, - task_data, - dynamic_pred="no", - normalize_dFC=True, -): - """ - Extract features and target for task presence classification - for a single subject. 
- dynamic_pred: "no", "past", "past_and_future" - """ - # dFC features - dFC_mat = dFC.get_dFC_mat() - TR_array = dFC.TR_array - if normalize_dFC: - dFC_mat = rank_norm(dFC_mat) - dFC_vecs = dFC_mat2vec(dFC_mat) - - # event data - task_presence = task_utils.extract_task_presence( - event_labels=task_data["event_labels"], - TR_task=1 / task_data["Fs_task"], - TR_mri=task_data["TR_mri"], - TR_array=TR_array, - binary=True, - binarizing_method="mean", - ) - - features = dFC_vecs - target = task_presence.ravel() - - if dynamic_pred == "past": - # concat current TR and two TR before of features to predict the current TR of target - # ignore the edge case of the first two TRs - features = np.concatenate( - (features, np.roll(features, 1, axis=0), np.roll(features, 2, axis=0)), axis=1 - ) - features = features[2:, :] - target = target[2:] - elif dynamic_pred == "past_and_future": - # concat current TR and two TR before and after of features to predict the current TR of target - # ignore the edge case of the first and last two TRs - features = np.concatenate( - ( - features, - np.roll(features, 1, axis=0), - np.roll(features, 2, axis=0), - np.roll(features, -1, axis=0), - np.roll(features, -2, axis=0), - ), - axis=1, - ) - features = features[2:-2, :] - target = target[2:-2] - - return features, target - - -def load_dFC(dFC_root, subj, task, dFC_id, run=None, session=None): - """ - Load the dFC results for a given subject, task, dFC_id, run and session. 
- """ - if session is None: - if run is None: - dFC = np.load( - f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE" - ).item() - else: - dFC = np.load( - f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE" - ).item() - else: - if run is None: - dFC = np.load( - f"{dFC_root}/{subj}/{session}/dFC_{session}_{task}_{dFC_id}.npy", - allow_pickle="TRUE", - ).item() - else: - dFC = np.load( - f"{dFC_root}/{subj}/{session}/dFC_{session}_{task}_{run}_{dFC_id}.npy", - allow_pickle="TRUE", - ).item() - - return dFC - - -def load_task_data(roi_root, subj, task, run=None, session=None): - """ - Load the task data for a given subject, task and run. - """ - if session is None: - if run is None: - task_data = np.load( - f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE" - ).item() - else: - task_data = np.load( - f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy", - allow_pickle="TRUE", - ).item() - else: - if run is None: - task_data = np.load( - f"{roi_root}/{subj}/{session}/{subj}_{session}_{task}_task-data.npy", - allow_pickle="TRUE", - ).item() - else: - task_data = np.load( - f"{roi_root}/{subj}/{session}/{subj}_{session}_{task}_{run}_task-data.npy", - allow_pickle="TRUE", - ).item() - - return task_data - - -def precheck_for_procruste(X_best, X_subj): - """ - Check if the two matrices have the same number of rows. if not, make them the same. 
- """ - # for the procrustes transformation, the number of samples should be the same - if X_subj.shape[0] > X_best.shape[0]: - # add zero rows to the embedding of the best subject - X_best_new = np.concatenate( - ( - X_best, - np.zeros( - ( - X_subj.shape[0] - X_best.shape[0], - X_best.shape[1], - ) - ), - ), - axis=0, - ) - elif X_subj.shape[0] < X_best.shape[0]: - # remove extra rows from the embedding of the best subject - X_best_new = X_best[: X_subj.shape[0], :] - else: - X_best_new = X_best - - X_best_new = X_best_new.copy() - - return X_best_new - - -def generalized_procrustes(X_list): - """ - Generalized Procrustes Analysis - - returns the mean X to be used as the reference for procrustes transformation - """ - # initialize Procrustes distance - current_distance = 0 - - # initialize a mean X - mean_X = np.array(X_list[0]) - - num_X = len(X_list) - - # create array for new Xs, add - new_Xs = np.zeros(np.array(X_list).shape) - - while True: - # add the mean X as first element of array - new_Xs[0] = mean_X - - # superimpose all shapes to current mean - for i in range(1, num_X): - _, new_X, _ = procrustes(mean_X, X_list[i]) - new_Xs[i] = new_X - - # calculate new mean - new_mean = np.mean(new_Xs, axis=0) - - _, _, new_distance = procrustes(new_mean, mean_X) - - # if the distance did not change, break the cycle - if np.abs(new_distance - current_distance) < 1e-6: - break - - # align the new_mean to old mean - _, new_mean, _ = procrustes(mean_X, new_mean) - - # update mean and distance - mean_X = new_mean - current_distance = new_distance - - return mean_X - - -def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"): - """ - Apply Laplacian Eigenmaps (LE) to transform data into a lower dimensional space. 
- """ - affinity_matrix = kneighbors_graph( - X, - n_neighbors=n_neighbors, - mode="connectivity", - include_self=False, - metric=distance_metric, - ) - affinity_matrix = affinity_matrix.toarray() - affinity_matrix = np.divide(affinity_matrix + affinity_matrix.T, 2) - LE = SpectralEmbedding( - n_components=n_components, affinity="precomputed", n_neighbors=n_neighbors - ) - X_embed = LE.fit_transform(X=affinity_matrix) - return X_embed - - -def LE_embed_procustes( - X_train, - X_test, - y_train, - y_test, - subj_label_train, - subj_label_test, - train_subjects, - test_subjects, - n_components=30, - n_neighbors_LE=125, - procruste_method="best_SI", -): - if procruste_method == "best_SI": - # first embed the dFC features of each subject into a lower dimensional space using LE separately - embed_dict = {} - for subject in train_subjects: - # assert the samples of the same subject are contiguous - assert np.all( - np.diff(np.where(subj_label_train == subject)[0]) == 1 - ), f"Indices of {subject} are not consecutive" - X_subj = X_train[subj_label_train == subject, :] - y_subj = y_train[subj_label_train == subject] - X_subj_embed = LE_transform( - X=X_subj, - n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), - distance_metric="correlation", - ) - SI = silhouette_score(X_subj_embed, y_subj) - embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI} - - # find the best transformation based on the SI score - best_SI = -1 - best_subject = None - for subject in embed_dict: - if embed_dict[subject]["SI"] > best_SI: - best_SI = embed_dict[subject]["SI"] - best_subject = subject - - # apply procrustes transformation to align the embeddings of different subjects - # use the embeddings of the subject with the highest SI score as the reference - X_train_embed = None - for subject in train_subjects: - X_subj_embed = embed_dict[subject]["X_subj_embed"] - # procrustes transformation - if subject == best_subject: - X_subj_embed_transformed = 
X_subj_embed - else: - # for the procrustes transformation, the number of samples should be the same - X_best_subj_embed = precheck_for_procruste( - embed_dict[best_subject]["X_subj_embed"], X_subj_embed - ) - _, X_subj_embed_transformed, _ = procrustes( - X_best_subj_embed, X_subj_embed - ) - if X_train_embed is None: - X_train_embed = X_subj_embed_transformed - else: - X_train_embed = np.concatenate( - (X_train_embed, X_subj_embed_transformed), axis=0 - ) - - # apply the same transformation to the test set - X_test_embed = None - for subject in test_subjects: - # assert the samples of the same subject are contiguous - assert np.all( - np.diff(np.where(subj_label_test == subject)[0]) == 1 - ), f"Indices of {subject} are not consecutive" - X_subj = X_test[subj_label_test == subject, :] - X_subj_embed = LE_transform( - X=X_subj, - n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), - distance_metric="correlation", - ) - # procrustes transformation - # for the procrustes transformation, the number of samples should be the same - X_best_subj_embed = precheck_for_procruste( - embed_dict[best_subject]["X_subj_embed"], X_subj_embed - ) - _, X_subj_embed_transformed, _ = procrustes(X_best_subj_embed, X_subj_embed) - if X_test_embed is None: - X_test_embed = X_subj_embed_transformed - else: - X_test_embed = np.concatenate( - (X_test_embed, X_subj_embed_transformed), axis=0 - ) - - elif procruste_method == "generalized": - # in this method we use generalized procrustes analysis to align the embeddings of different subjects - # first embed the dFC features of each subject into a lower dimensional space using LE separately - embed_dict = {} - for subject in train_subjects: - # assert the samples of the same subject are contiguous - assert np.all( - np.diff(np.where(subj_label_train == subject)[0]) == 1 - ), f"Indices of {subject} are not consecutive" - X_subj = X_train[subj_label_train == subject, :] - X_subj_embed = LE_transform( - X=X_subj, - 
n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), - distance_metric="correlation", - ) - embed_dict[subject] = X_subj_embed - - # then find the max number of samples among all subjects - max_samples = 0 - for subject in train_subjects: - if embed_dict[subject].shape[0] > max_samples: - max_samples = embed_dict[subject].shape[0] - - # find the mean embedding of all subjects to use as the reference for procrustes transformation - X_train_list = [] - for subject in train_subjects: - X_subj_embed = embed_dict[subject] - # add zero rows to the embedding of the subject with less samples - if X_subj_embed.shape[0] < max_samples: - X_subj_embed_new = np.concatenate( - ( - X_subj_embed, - np.zeros( - ( - max_samples - X_subj_embed.shape[0], - X_subj_embed.shape[1], - ) - ), - ), - axis=0, - ) - else: - X_subj_embed_new = X_subj_embed - X_train_list.append(X_subj_embed_new) - mean_X_train = generalized_procrustes(X_train_list) - - X_train_embed = None - for subject in train_subjects: - X_subj_embed = embed_dict[subject] - mean_X_train_new_size = precheck_for_procruste(mean_X_train, X_subj_embed) - _, X_subj_embed_transformed, _ = procrustes( - mean_X_train_new_size, X_subj_embed - ) - if X_train_embed is None: - X_train_embed = X_subj_embed_transformed - else: - X_train_embed = np.concatenate( - (X_train_embed, X_subj_embed_transformed), axis=0 - ) - - X_test_embed = None - for subject in test_subjects: - X_subj = X_test[subj_label_test == subject, :] - X_subj_embed = LE_transform( - X=X_subj, - n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), - distance_metric="correlation", - ) - mean_X_train_new_size = precheck_for_procruste(mean_X_train, X_subj_embed) - _, X_subj_embed_transformed, _ = procrustes( - mean_X_train_new_size, X_subj_embed - ) - if X_test_embed is None: - X_test_embed = X_subj_embed_transformed - else: - X_test_embed = np.concatenate( - (X_test_embed, X_subj_embed_transformed), axis=0 - ) - - return 
X_train_embed, X_test_embed - - -def embed_dFC_features( - train_subjects, - test_subjects, - X_train, - X_test, - y_train, - y_test, - subj_label_train, - subj_label_test, - embedding="PCA", - n_components=30, - n_neighbors_LE=125, - LE_embedding_method="concat+embed", -): - """ - Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous. - - for LE, first the LE is applied on each subj separately and then the procrustes transformation is applied to align the embeddings of different subjects. - All the subjects are transformed into the space of the subject with the highest silhouette score. - - LE_embedding_method: "concat+embed" or "embed+procrustes" - """ - if embedding == "PCA": - pca = PCA(n_components=n_components, svd_solver="full", whiten=False) - pca.fit(X_train) - X_train_embed = pca.transform(X_train) - if X_test is not None: - X_test_embed = pca.transform(X_test) - else: - X_test_embed = None - elif embedding == "LE": - if LE_embedding_method == "embed+procrustes": - X_train_embed, X_test_embed = LE_embed_procustes( - X_train=X_train, - X_test=X_test, - y_train=y_train, - y_test=y_test, - subj_label_train=subj_label_train, - subj_label_test=subj_label_test, - train_subjects=train_subjects, - test_subjects=test_subjects, - n_components=n_components, - n_neighbors_LE=n_neighbors_LE, - procruste_method="generalized", - ) - elif LE_embedding_method == "concat+embed": - # since SpectralEmbedding does not have transform method, we need to fit the LE on the whole data - if X_test is not None: - X_concat = np.concatenate((X_train, X_test), axis=0) - else: - X_concat = X_train - X_concat_embed = LE_transform( - X=X_concat, - n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_concat.shape[0]), - distance_metric="correlation", - ) - X_train_embed = X_concat_embed[: X_train.shape[0], :] - if X_test is not None: - X_test_embed = X_concat_embed[X_train.shape[0] :, :] - 
else: - X_test_embed = None - - return X_train_embed, X_test_embed - - -def dFC_feature_extraction( - task, - train_subjects, - test_subjects, - dFC_id, - roi_root, - dFC_root, - run=None, - session=None, - dynamic_pred="no", - normalize_dFC=True, -): - """ - Extract features and target for task presence classification - for all subjects. - if run is specified, dFC results for that run will be used. - """ - dFC_measure_name = None - X_train = None - y_train = None - subj_label_train = list() - for subj in train_subjects: - - dFC = load_dFC( - dFC_root=dFC_root, - subj=subj, - task=task, - dFC_id=dFC_id, - run=run, - session=session, - ) - task_data = load_task_data( - roi_root=roi_root, subj=subj, task=task, run=run, session=session - ) - - X_subj, y_subj = dFC_feature_extraction_subj_lvl( - dFC=dFC, - task_data=task_data, - dynamic_pred=dynamic_pred, - normalize_dFC=normalize_dFC, - ) - - subj_label_train.extend([subj for i in range(X_subj.shape[0])]) - if X_train is None and y_train is None: - X_train = X_subj - y_train = y_subj - else: - X_train = np.concatenate((X_train, X_subj), axis=0) - y_train = np.concatenate((y_train, y_subj), axis=0) - - if dFC_measure_name is None: - dFC_measure_name = dFC.measure.measure_name - else: - assert ( - dFC_measure_name == dFC.measure.measure_name - ), "dFC measure is not consistent." 
- - X_test = None - y_test = None - subj_label_test = list() - for subj in test_subjects: - dFC = load_dFC( - dFC_root=dFC_root, - subj=subj, - task=task, - dFC_id=dFC_id, - run=run, - session=session, - ) - task_data = load_task_data( - roi_root=roi_root, subj=subj, task=task, run=run, session=session - ) - - X_subj, y_subj = dFC_feature_extraction_subj_lvl( - dFC=dFC, - task_data=task_data, - dynamic_pred=dynamic_pred, - normalize_dFC=normalize_dFC, - ) - - subj_label_test.extend([subj for i in range(X_subj.shape[0])]) - if X_test is None and y_test is None: - X_test = X_subj - y_test = y_subj - else: - X_test = np.concatenate((X_test, X_subj), axis=0) - y_test = np.concatenate((y_test, y_subj), axis=0) - - if dFC_measure_name is None: - dFC_measure_name = dFC.measure.measure_name - else: - assert ( - dFC_measure_name == dFC.measure.measure_name - ), "dFC measure is not consistent." - - # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape) - subj_label_train = np.array(subj_label_train) - subj_label_test = np.array(subj_label_test) - - return ( - X_train, - X_test, - y_train, - y_test, - subj_label_train, - subj_label_test, - dFC_measure_name, - ) - - -def logistic_regression_classify(X_train, y_train, X_test, y_test): - """ - Logistic regression classification - """ - # create a pipeline with a logistic regression model to find the best C - logistic_reg = make_pipeline( - StandardScaler(), LogisticRegression(penalty="l1", solver="saga") - ) - # create a dictionary of all values we want to test for C - param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]} - # use gridsearch to test all values for C - lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=5) - # fit model to data - lr_gscv.fit(X_train, y_train) - - C = lr_gscv.best_params_["logisticregression__C"] - - log_reg = make_pipeline( - StandardScaler(), - LogisticRegression(penalty="l1", C=C, solver="saga"), - ).fit(X_train, y_train) - - RESULT = { - "log_reg_model": 
log_reg, - "log_reg_C": C, - "log_reg_train_score": log_reg.score(X_train, y_train), - "log_reg_test_score": log_reg.score(X_test, y_test), - } - - return RESULT - - -def KNN_classify(X_train, y_train, X_test, y_test): - """ - KNN classification - """ - # create a pipeline with a knn model to find the best n_neighbors - knn = make_pipeline( - StandardScaler(), - KNeighborsClassifier(), - ) - # create a dictionary of all values we want to test for n_neighbors - param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)} - # use gridsearch to test all values for n_neighbors - knn_gscv = GridSearchCV(knn, param_grid, cv=5) - # fit model to data - knn_gscv.fit(X_train, y_train) - - n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"] - - neigh = make_pipeline( - StandardScaler(), - KNeighborsClassifier(n_neighbors=n_neighbors), - ).fit(X_train, y_train) - - RESULT = { - "KNN_cv_results": knn_gscv.cv_results_, - "KNN_model": neigh, - "KNN_train_score": neigh.score(X_train, y_train), - "KNN_test_score": neigh.score(X_test, y_test), - } - - return RESULT - - -def random_forest_classify(X_train, y_train, X_test, y_test): - """ - Random Forest classification - """ - # create a pipeline with a random forest model to find the best n_estimators - rf = make_pipeline( - StandardScaler(), - RandomForestClassifier(), - ) - # create a dictionary of all values we want to test for n_estimators - param_grid = { - "randomforestclassifier__n_estimators": [10, 50, 100, 200], - "randomforestclassifier__max_depth": [None, 5, 10, 20, 30], - } - # use gridsearch to test all values for n_estimators - rf_gscv = GridSearchCV(rf, param_grid, cv=5) - # fit model to data - rf_gscv.fit(X_train, y_train) - - n_estimators = rf_gscv.best_params_["randomforestclassifier__n_estimators"] - max_depth = rf_gscv.best_params_["randomforestclassifier__max_depth"] - - rf = make_pipeline( - StandardScaler(), - RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth), - 
).fit(X_train, y_train) - - RESULT = { - "RF_cv_results": rf_gscv.cv_results_, - "RF_model": rf, - "RF_train_score": rf.score(X_train, y_train), - "RF_test_score": rf.score(X_test, y_test), - } - - return RESULT - - -def gradient_boosting_classify(X_train, y_train, X_test, y_test): - """ - Gradient Boosting classification - """ - # create a pipeline with a gradient boosting model to find the best n_estimators - gb = make_pipeline( - StandardScaler(), - GradientBoostingClassifier(), - ) - # create a dictionary of all values we want to test for n_estimators - param_grid = { - "gradientboostingclassifier__n_estimators": [10, 50, 100, 200], - "gradientboostingclassifier__learning_rate": [0.01, 0.1, 0.2], - "gradientboostingclassifier__max_depth": [3, 5, 10], - } - # use gridsearch to test all values for n_estimators - gb_gscv = GridSearchCV(gb, param_grid, cv=5) - # fit model to data - gb_gscv.fit(X_train, y_train) - - n_estimators = gb_gscv.best_params_["gradientboostingclassifier__n_estimators"] - learning_rate = gb_gscv.best_params_["gradientboostingclassifier__learning_rate"] - max_depth = gb_gscv.best_params_["gradientboostingclassifier__max_depth"] - - gb = make_pipeline( - StandardScaler(), - GradientBoostingClassifier( - n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate - ), - ).fit(X_train, y_train) - - RESULT = { - "GB_cv_results": gb_gscv.cv_results_, - "GB_model": gb, - "GB_train_score": gb.score(X_train, y_train), - "GB_test_score": gb.score(X_test, y_test), - } - - return RESULT - - -def task_presence_classification( - task, - dFC_id, - roi_root, - dFC_root, - run=None, - session=None, - dynamic_pred="no", - normalize_dFC=True, - train_test_ratio=0.8, -): - """ - perform task presence classification using logistic regression, KNN, Random Forest, Gradient Boosting - for a given task and dFC method and run. 
- """ - if run is None: - print(f"=============== {task} ===============") - else: - print(f"=============== {task} {run} ===============") - - if task == "task-restingstate": - return - - SUBJECTS = find_available_subjects( - dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id - ) - - # randomly select train_test_ratio of the subjects for training - # and rest for testing using numpy.random.choice - train_subjects = np.random.choice( - SUBJECTS, int(train_test_ratio * len(SUBJECTS)), replace=False - ) - test_subjects = np.setdiff1d(SUBJECTS, train_subjects) - print( - f"Number of train subjects: {len(train_subjects)} and test subjects: {len(test_subjects)}" - ) - - X_train, X_test, y_train, y_test, subj_label_train, subj_label_test, measure_name = ( - dFC_feature_extraction( - task=task, - train_subjects=train_subjects, - test_subjects=test_subjects, - dFC_id=dFC_id, - roi_root=roi_root, - dFC_root=dFC_root, - run=run, - session=session, - dynamic_pred=dynamic_pred, - normalize_dFC=normalize_dFC, - ) - ) - - # embed dFC features - X_train, X_test = embed_dFC_features( - train_subjects=train_subjects, - test_subjects=test_subjects, - X_train=X_train, - X_test=X_test, - y_train=y_train, - y_test=y_test, - subj_label_train=subj_label_train, - subj_label_test=subj_label_test, - embedding="LE", - n_components=30, - n_neighbors_LE=125, - LE_embedding_method="embed+procrustes", - ) - - # task presence classification - - print("task presence classification ...") - - # logistic regression - log_reg_RESULT = logistic_regression_classify(X_train, y_train, X_test, y_test) - - # KNN - KNN_RESULT = KNN_classify(X_train, y_train, X_test, y_test) - - # # Random Forest - # RF_RESULT = random_forest_classify( - # X_train, y_train, X_test, y_test - # ) - - # # Gradient Boosting - # GBT_RESULT = gradient_boosting_classify( - # X_train, y_train, X_test, y_test - # ) - - ML_RESULT = {} - for key in log_reg_RESULT: - ML_RESULT[key] = log_reg_RESULT[key] - for key in 
KNN_RESULT: - ML_RESULT[key] = KNN_RESULT[key] - # for key in RF_RESULT: - # ML_RESULT[key] = RF_RESULT[key] - # for key in GBT_RESULT: - # ML_RESULT[key] = GBT_RESULT[key] - - # measure pred score on each subj - - ML_scores = { - "subj_id": list(), - "group": list(), - "task": list(), - "run": list(), - "dFC method": list(), - "Logistic regression accuracy": list(), - "KNN accuracy": list(), - # "Random Forest accuracy": list(), - # "Gradient Boosting accuracy": list(), - } - log_reg = log_reg_RESULT["log_reg_model"] - KNN = KNN_RESULT["KNN_model"] - # RF = RF_RESULT["RF_model"] - # GBT = GBT_RESULT["GB_model"] - - for subj in SUBJECTS: - ML_scores["subj_id"].append(subj) - if subj in train_subjects: - ML_scores["group"].append("train") - features = X_train[subj_label_train == subj, :] - target = y_train[subj_label_train == subj] - elif subj in test_subjects: - ML_scores["group"].append("test") - features = X_test[subj_label_test == subj, :] - target = y_test[subj_label_test == subj] - - pred_lr = log_reg.predict(features) - pred_KNN = KNN.predict(features) - # pred_RF = RF.predict(features) - # pred_GBT = GBT.predict(features) - - ML_scores["Logistic regression accuracy"].append( - balanced_accuracy_score(target, pred_lr) - ) - ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN)) - # ML_scores["Random Forest accuracy"].append( - # balanced_accuracy_score(target, pred_RF) - # ) - # ML_scores["Gradient Boosting accuracy"].append( - # balanced_accuracy_score(target, pred_GBT) - # ) - - ML_scores["task"].append(task) - ML_scores["run"].append(run) - ML_scores["dFC method"].append(measure_name) - - return ML_RESULT, ML_scores - - -def task_presence_clustering( - task, - dFC_id, - roi_root, - dFC_root, - run=None, - session=None, - normalize_dFC=True, -): - if run is None: - print(f"=============== {task} ===============") - else: - print(f"=============== {task} {run} ===============") - - if task == "task-restingstate": - return - - SUBJECTS = 
find_available_subjects( - dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id - ) - - print(f"Number of subjects: {len(SUBJECTS)}") - - X, _, y, _, subj_label, _, measure_name = dFC_feature_extraction( - task=task, - train_subjects=SUBJECTS, - test_subjects=[], - dFC_id=dFC_id, - roi_root=roi_root, - dFC_root=dFC_root, - run=run, - session=session, - dynamic_pred="no", - normalize_dFC=normalize_dFC, - ) - - # embed dFC features - X, _ = embed_dFC_features( - train_subjects=SUBJECTS, - test_subjects=[], - X_train=X, - X_test=None, - y_train=y, - y_test=None, - subj_label_train=subj_label, - subj_label_test=None, - embedding="LE", - n_components=30, - n_neighbors_LE=125, - LE_embedding_method="embed+procrustes", - ) - - # clustering - # apply kmeans clustering to dFC features - - n_clusters = 2 # corresponding to task and rest - - scaler = StandardScaler() - X_normalized = scaler.fit_transform(X) - kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5) - labels_pred = kmeans.fit_predict(X_normalized) - - # ARI score - print(f"ARI score: {adjusted_rand_score(y, labels_pred)}") - - # # visualize clustering centroids - # centroids = kmeans.cluster_centers_ - # centroids = pca.inverse_transform(centroids) - # centroids = scaler.inverse_transform(centroids) - # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) - # centroids_mat = dFC_vec2mat(centroids, n_regions) - - clustering_RESULTS = { - "StandardScaler": scaler, - "kmeans": kmeans, - "ARI": adjusted_rand_score(y, labels_pred), - # "centroids": centroids_mat, - } - - clustering_scores = { - "subj_id": list(), - "task": list(), - "run": list(), - "dFC method": list(), - "Kmeans ARI": list(), - "SI": list(), - } - for subj in SUBJECTS: - clustering_scores["subj_id"].append(subj) - features = X[subj_label == subj, :] - target = y[subj_label == subj] - - features_normalized = scaler.transform(features) - pred_kmeans = kmeans.predict(features_normalized) - - 
clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_kmeans)) - - # silhouette score in terms of separability of original labels, not the clustering labels - clustering_scores["SI"].append(silhouette_score(features, target)) - - clustering_scores["task"].append(task) - clustering_scores["run"].append(run) - clustering_scores["dFC method"].append(measure_name) - - return clustering_RESULTS, clustering_scores - - def run_classification( dFC_id, TASKS, @@ -1204,7 +131,7 @@ def run_clustering( np.save(f"{folder}/clustering_scores_{dFC_id}.npy", clustering_scores) -def task_paradigm_clustering( +def run_task_paradigm_clustering( dFC_id, TASKS, RUNS, @@ -1398,6 +325,7 @@ def task_paradigm_clustering( RUNS=RUNS, SESSIONS=SESSIONS, roi_root=roi_root, + dFC_root=dFC_root, output_root=ML_root, ) print("Task features extraction finished.") @@ -1442,7 +370,7 @@ def task_paradigm_clustering( print(f"Task paradigm clustering started for dFC ID {dFC_id} ...") try: - task_paradigm_clustering( + run_task_paradigm_clustering( dFC_id=dFC_id, TASKS=TASKS, RUNS=RUNS, diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 21bc05b..36b1527 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -353,17 +353,17 @@ def calculate_subj_lvl_task_presence_characteristics( binary=True, binarizing_method="mean", ) - relative_task_on = task_utils.relative_task_on(task_presence) + relative_task_on = task_utils.calc_relative_task_on(task_presence) # task duration - avg_task_duration, var_task_duration = task_utils.task_duration( + avg_task_duration, var_task_duration = task_utils.calc_task_duration( task_presence, task_data["TR_mri"] ) # rest duration - avg_rest_duration, var_rest_duration = task_utils.rest_duration( + avg_rest_duration, var_rest_duration = task_utils.calc_rest_duration( task_presence, task_data["TR_mri"] ) # freq of transitions - num_of_transitions, relative_transition_freq = task_utils.transition_freq( + 
num_of_transitions, relative_transition_freq = task_utils.calc_transition_freq( task_presence ) From 45e3fd5d82fe84ecff69e329f502e69f7d89211c Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 8 Aug 2024 12:19:21 -0400 Subject: [PATCH 105/401] minor change --- pydfc/ml_utils.py | 111 +++++++++++++++++++++++++++++++++++++++++++ task_dFC/ML.py | 118 ++++------------------------------------------ 2 files changed, 120 insertions(+), 109 deletions(-) diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py index 275753b..ad3ad79 100644 --- a/pydfc/ml_utils.py +++ b/pydfc/ml_utils.py @@ -1113,3 +1113,114 @@ def task_presence_clustering( clustering_scores["dFC method"].append(measure_name) return clustering_RESULTS, clustering_scores + + +def task_paradigm_clustering( + dFC_id, + TASKS, + RUNS, + session, + roi_root, + dFC_root, + normalize_dFC=True, +): + # find SUBJECTS common to all tasks + for task_id, task in enumerate(TASKS): + if task_id == 0: + SUBJECTS = find_available_subjects( + dFC_root=dFC_root, task=task, dFC_id=dFC_id + ) + else: + SUBJECTS = np.intersect1d( + SUBJECTS, + find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id), + ) + print(f"Number of subjects: {len(SUBJECTS)}") + + X = None + y = None + subj_label = None + measure_name = None + for task_id, task in enumerate(TASKS): + for run in RUNS[task]: + X_new, _, _, _, subj_label_new, _, measure_name_new = dFC_feature_extraction( + task=task, + train_subjects=SUBJECTS, + test_subjects=[], + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + run=run, + session=session, + dynamic_pred="no", + normalize_dFC=normalize_dFC, + ) + + if measure_name is not None: + assert measure_name == measure_name_new, "dFC measure is not consistent." 
+ else: + measure_name = measure_name_new + + y_new = np.ones(X_new.shape[0]) * task_id + if X is None and y is None: + X = X_new + y = y_new + subj_label = subj_label_new + else: + X = np.concatenate((X, X_new), axis=0) + y = np.concatenate((y, y_new), axis=0) + subj_label = np.concatenate((subj_label, subj_label_new), axis=0) + + assert X.shape[0] == y.shape[0], "Number of samples do not match." + assert X.shape[0] == subj_label.shape[0], "Number of samples do not match." + + # rearrange the order of the samples so that the samples of the same subject are together + idx = np.argsort(subj_label) + X = X[idx, :] + y = y[idx] + subj_label = subj_label[idx] + + # embed dFC features + X_embed, _ = embed_dFC_features( + train_subjects=SUBJECTS, + test_subjects=[], + X_train=X, + X_test=None, + y_train=y, + y_test=None, + subj_label_train=subj_label, + subj_label_test=None, + embedding="LE", + n_components=30, + n_neighbors_LE=125, + LE_embedding_method="embed+procrustes", + ) + + # clustering + # apply kmeans clustering to dFC features + + n_clusters = len(TASKS) # corresponding to task paradigms + + scaler = StandardScaler() + X_normalized = scaler.fit_transform(X_embed) + kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5) + labels_pred = kmeans.fit_predict(X_normalized) + + # # visualize clustering centroids + # centroids = kmeans.cluster_centers_ + # centroids = pca.inverse_transform(centroids) + # centroids = scaler.inverse_transform(centroids) + # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) + # centroids_mat = dFC_vec2mat(centroids, n_regions) + + task_paradigm_clstr_RESULTS = { + "dFC_method": measure_name, + "StandardScaler": scaler, + "kmeans": kmeans, + "ARI": adjusted_rand_score(y, labels_pred), + "SI": silhouette_score(X_normalized, y), + # "centroids": centroids_mat, + "task_paradigms": TASKS, + } + + return task_paradigm_clstr_RESULTS diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 1fb7bd1..d44e449 100644 --- 
a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -4,15 +4,10 @@ import traceback import numpy as np -from sklearn.cluster import KMeans -from sklearn.metrics import adjusted_rand_score, silhouette_score -from sklearn.preprocessing import StandardScaler from pydfc.ml_utils import ( - dFC_feature_extraction, - embed_dFC_features, extract_task_features, - find_available_subjects, + task_paradigm_clustering, task_presence_classification, task_presence_clustering, ) @@ -142,112 +137,17 @@ def run_task_paradigm_clustering( normalize_dFC=True, ): for session in SESSIONS: - # find SUBJECTS common to all tasks - for task_id, task in enumerate(TASKS): - if task_id == 0: - SUBJECTS = find_available_subjects( - dFC_root=dFC_root, task=task, dFC_id=dFC_id - ) - else: - SUBJECTS = np.intersect1d( - SUBJECTS, - find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id), - ) - print(f"Number of subjects: {len(SUBJECTS)}") - - X = None - y = None - subj_label = None - measure_name = None - for task_id, task in enumerate(TASKS): - for run in RUNS[task]: - X_new, _, _, _, subj_label_new, _, measure_name_new = ( - dFC_feature_extraction( - task=task, - train_subjects=SUBJECTS, - test_subjects=[], - dFC_id=dFC_id, - roi_root=roi_root, - dFC_root=dFC_root, - run=run, - session=session, - dynamic_pred="no", - normalize_dFC=normalize_dFC, - ) - ) - - if measure_name is not None: - assert ( - measure_name == measure_name_new - ), "dFC measure is not consistent." - else: - measure_name = measure_name_new - y_new = np.ones(X_new.shape[0]) * task_id - if X is None and y is None: - X = X_new - y = y_new - subj_label = subj_label_new - else: - X = np.concatenate((X, X_new), axis=0) - y = np.concatenate((y, y_new), axis=0) - subj_label = np.concatenate((subj_label, subj_label_new), axis=0) - - assert X.shape[0] == y.shape[0], "Number of samples do not match." - assert X.shape[0] == subj_label.shape[0], "Number of samples do not match." 
- - # rearrange the order of the samples so that the samples of the same subject are together - idx = np.argsort(subj_label) - X = X[idx, :] - y = y[idx] - subj_label = subj_label[idx] - - # embed dFC features - X, _ = embed_dFC_features( - train_subjects=SUBJECTS, - test_subjects=[], - X_train=X, - X_test=None, - y_train=y, - y_test=None, - subj_label_train=subj_label, - subj_label_test=None, - embedding="LE", - n_components=30, - n_neighbors_LE=125, - LE_embedding_method="embed+procrustes", + task_paradigm_clstr_RESULTS = task_paradigm_clustering( + dFC_id=dFC_id, + TASKS=TASKS, + RUNS=RUNS, + session=session, + roi_root=roi_root, + dFC_root=dFC_root, + normalize_dFC=normalize_dFC, ) - # clustering - # apply kmeans clustering to dFC features - - n_clusters = len(TASKS) # corresponding to task paradigms - - scaler = StandardScaler() - X_normalized = scaler.fit_transform(X) - kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5) - labels_pred = kmeans.fit_predict(X_normalized) - - # ARI score - print(f"ARI score: {adjusted_rand_score(y, labels_pred)}") - - # # visualize clustering centroids - # centroids = kmeans.cluster_centers_ - # centroids = pca.inverse_transform(centroids) - # centroids = scaler.inverse_transform(centroids) - # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) - # centroids_mat = dFC_vec2mat(centroids, n_regions) - - task_paradigm_clstr_RESULTS = { - "dFC_method": measure_name, - "StandardScaler": scaler, - "kmeans": kmeans, - "ARI": adjusted_rand_score(y, labels_pred), - "SI": silhouette_score(X_normalized, y), - # "centroids": centroids_mat, - "task_paradigms": TASKS, - } - if session is None: folder = f"{output_root}" else: From 34e40e152744dea485b92fe3f91ca1b0f72be52a Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 13 Aug 2024 22:37:36 -0400 Subject: [PATCH 106/401] add intrinsic dim estimate --- pydfc/ml_utils.py | 162 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 154 insertions(+), 8 
deletions(-) diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py index ad3ad79..72d35d5 100644 --- a/pydfc/ml_utils.py +++ b/pydfc/ml_utils.py @@ -9,14 +9,15 @@ import numpy as np from scipy.spatial import procrustes +from scipy.stats import zscore from sklearn.cluster import KMeans from sklearn.decomposition import PCA from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier -from sklearn.linear_model import LogisticRegression +from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.manifold import SpectralEmbedding from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score, silhouette_score from sklearn.model_selection import GridSearchCV, RandomizedSearchCV -from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph +from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors, kneighbors_graph from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler @@ -449,6 +450,132 @@ def generalized_procrustes(X_list): return mean_X +def twonn(X, discard_ratio=0.1): + """ + Calculates intrinsic dimension of the provided data points with the TWO-NN algorithm. + + ----------- + Parameters: + + X : 2d array-like + (n_samples, n_features) + discard_fraction : float between 0 and 1 + Fraction of largest distances to discard (heuristic from the paper) + + Returns: + + d : float + Intrinsic dimension of the dataset according to TWO-NN. 
+ """ + + num_samples = X.shape[0] + + NN = NearestNeighbors(n_neighbors=30) + NN.fit(X) + distances, _ = NN.kneighbors(return_distance=True) + + mu = np.zeros((num_samples)) + for i in range(num_samples): + # find the two nearest neighbors that have different distances and the distance is not 0 + r1, r2 = None, None + for j in range(distances.shape[1]): + if distances[i, j] != 0: + if r1 is None: + r1 = distances[i, j] + elif distances[i, j] != r1: + r2 = distances[i, j] + break + if r1 is not None and r2 is not None: + mu[i] = r2 / r1 + else: + mu[i] = np.nan + + # discard NaN values + mu = mu[~np.isnan(mu)] + # large distances will cause the estimation to be biased, discard them + mu = mu[np.argsort(mu)[: int((1 - discard_ratio) * num_samples)]] + + # CDF + CDF = np.arange(1, 1 + len(mu)) / num_samples + # Fit the formula: log(1 - CDF) = d * log(mu) + lr = LinearRegression(fit_intercept=False) + lr.fit(np.log(mu).reshape(-1, 1), -np.log(1 - CDF).reshape(-1, 1)) + d = lr.coef_[0][0] + + return d + + +def SI_ID(X, y, search_range=range(2, 50, 5), n_neighbors_LE=125): + """ + Find the intrinsic dimension of the data based on the silhouette score. + """ + + SI_score = {} + for n_components in search_range: + X_train_embed, _ = embed_dFC_features( + train_subjects=["subj"], + test_subjects=[], + X_train=X, + X_test=None, + y_train=y, + y_test=None, + subj_label_train=np.array(["subj"] * len(y)), + subj_label_test=None, + embedding="LE", + n_components=n_components, + n_neighbors_LE=n_neighbors_LE, + LE_embedding_method="embed+procrustes", + ) + + SI_score[n_components] = silhouette_score(X_train_embed, y) + + # find the intrinsic dimension based on the silhouette score + intrinsic_dim = max(SI_score, key=SI_score.get) + + return intrinsic_dim + + +def find_intrinsic_dim( + X, + y, + subj_label, + subjects, + method="SI", + n_neighbors_LE=125, + search_range_SI=range(2, 50, 5), +): + """ + Find the number of components to use for embedding the data using LE. 
+ Find the average intrinsic dimension across all subjects. + + method: "SI" or "twonn" + + Returns: + intrinsic_dim: number of components to use for embedding + """ + if method == "SI": + intrinsic_dim_all = list() + for subject in subjects: + X_subj = X[subj_label == subject, :] + y_subj = y[subj_label == subject] + intrinsic_dim_all.append( + SI_ID( + X_subj, + y_subj, + search_range=search_range_SI, + n_neighbors_LE=n_neighbors_LE, + ) + ) + intrinsic_dim = int(np.mean(intrinsic_dim_all)) + elif method == "twonn": + intrinsic_dim_all = list() + for subject in subjects: + X_subj = X[subj_label == subject, :] + intrinsic_dim_all.append(twonn(X_subj, discard_ratio=0.1)) + intrinsic_dim = int(np.mean(intrinsic_dim_all)) + return intrinsic_dim + + def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"): """ Apply Laplacian Eigenmaps (LE) to transform data into a lower dimensional space. @@ -653,9 +780,9 @@ def embed_dFC_features( subj_label_train, subj_label_test, embedding="PCA", - n_components=30, + n_components="auto", n_neighbors_LE=125, - LE_embedding_method="concat+embed", + LE_embedding_method="embed+procrustes", ): """ Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous. 
@@ -666,7 +793,11 @@ def embed_dFC_features( LE_embedding_method: "concat+embed" or "embed+procrustes" """ if embedding == "PCA": - pca = PCA(n_components=n_components, svd_solver="full", whiten=False) + # if n_components is not specified, use 95% of the variance + if n_components == "auto": + pca = PCA(n_components=0.95, svd_solver="full", whiten=False) + else: + pca = PCA(n_components=n_components, svd_solver="full", whiten=False) pca.fit(X_train) X_train_embed = pca.transform(X_train) if X_test is not None: @@ -674,6 +805,18 @@ def embed_dFC_features( else: X_test_embed = None elif embedding == "LE": + # if n_components is not specified, find the intrinsic dimension of the data using training set and based on the silhouette score + if n_components == "auto": + n_components = find_intrinsic_dim( + X=X_train, + y=y_train, + subj_label=subj_label_train, + subjects=train_subjects, + method="SI", + n_neighbors_LE=n_neighbors_LE, + search_range_SI=range(2, 50, 5), + ) + if LE_embedding_method == "embed+procrustes": X_train_embed, X_test_embed = LE_embed_procustes( X_train=X_train, @@ -918,7 +1061,7 @@ def task_presence_classification( subj_label_train=subj_label_train, subj_label_test=subj_label_test, embedding="LE", - n_components=30, + n_components="auto", n_neighbors_LE=125, LE_embedding_method="embed+procrustes", ) @@ -1055,7 +1198,7 @@ def task_presence_clustering( subj_label_train=subj_label, subj_label_test=None, embedding="LE", - n_components=30, + n_components="auto", n_neighbors_LE=125, LE_embedding_method="embed+procrustes", ) @@ -1156,6 +1299,9 @@ def task_paradigm_clustering( normalize_dFC=normalize_dFC, ) + # normalize the features + X_new = zscore(X_new, axis=0) + if measure_name is not None: assert measure_name == measure_name_new, "dFC measure is not consistent." 
else: @@ -1191,7 +1337,7 @@ def task_paradigm_clustering( subj_label_train=subj_label, subj_label_test=None, embedding="LE", - n_components=30, + n_components="auto", n_neighbors_LE=125, LE_embedding_method="embed+procrustes", ) From 013d01d0bad64b88e5d48238cda7daa33449b243 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 20 Aug 2024 13:36:18 -0400 Subject: [PATCH 107/401] minor change --- pydfc/ml_utils.py | 30 +++++++++++++++++++++++------- task_dFC/generate_report.py | 12 ++++++++++-- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py index 72d35d5..5c17002 100644 --- a/pydfc/ml_utils.py +++ b/pydfc/ml_utils.py @@ -6,6 +6,7 @@ @author: Mohammad Torabi """ import os +import warnings import numpy as np from scipy.spatial import procrustes @@ -579,10 +580,23 @@ def find_intrinsic_dim( def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"): """ Apply Laplacian Eigenmaps (LE) to transform data into a lower dimensional space. + + if n_neighbors >= n_samples, n_neighbors will be changed to the lower limit n_neighbors """ + min_n_neighbors = 70 + + if n_neighbors >= X.shape[0]: + n_neighbors_to_be_used = min_n_neighbors + # raise a warning + warnings.warn( + "n_neighbors is larger than the number of samples. n_neighbors is set to the minimum value of 70." 
+ ) + else: + n_neighbors_to_be_used = n_neighbors + affinity_matrix = kneighbors_graph( X, - n_neighbors=n_neighbors, + n_neighbors=n_neighbors_to_be_used, mode="connectivity", include_self=False, metric=distance_metric, @@ -590,7 +604,9 @@ def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"): affinity_matrix = affinity_matrix.toarray() affinity_matrix = np.divide(affinity_matrix + affinity_matrix.T, 2) LE = SpectralEmbedding( - n_components=n_components, affinity="precomputed", n_neighbors=n_neighbors + n_components=n_components, + affinity="precomputed", + n_neighbors=n_neighbors_to_be_used, ) X_embed = LE.fit_transform(X=affinity_matrix) return X_embed @@ -622,7 +638,7 @@ def LE_embed_procustes( X_subj_embed = LE_transform( X=X_subj, n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + n_neighbors=n_neighbors_LE, distance_metric="correlation", ) SI = silhouette_score(X_subj_embed, y_subj) @@ -670,7 +686,7 @@ def LE_embed_procustes( X_subj_embed = LE_transform( X=X_subj, n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + n_neighbors=n_neighbors_LE, distance_metric="correlation", ) # procrustes transformation @@ -699,7 +715,7 @@ def LE_embed_procustes( X_subj_embed = LE_transform( X=X_subj, n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + n_neighbors=n_neighbors_LE, distance_metric="correlation", ) embed_dict[subject] = X_subj_embed @@ -753,7 +769,7 @@ def LE_embed_procustes( X_subj_embed = LE_transform( X=X_subj, n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_subj.shape[0]), + n_neighbors=n_neighbors_LE, distance_metric="correlation", ) mean_X_train_new_size = precheck_for_procruste(mean_X_train, X_subj_embed) @@ -840,7 +856,7 @@ def embed_dFC_features( X_concat_embed = LE_transform( X=X_concat, n_components=n_components, - n_neighbors=min(n_neighbors_LE, X_concat.shape[0]), + n_neighbors=n_neighbors_LE, distance_metric="correlation", 
) X_train_embed = X_concat_embed[: X_train.shape[0], :] diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 36b1527..4c99d88 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -507,6 +507,8 @@ def plot_ML_results( capsize=0.1, ) g.axhline(0.5, color="r", linestyle="--") + # set the y-axis upper limit to 1, but not set the lower limit + g.set(ylim=(None, 1)) if show_title: g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"}) @@ -603,6 +605,8 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): capsize=0.1, ) g.axhline(0.0, color="r", linestyle="--") + # set the y-axis upper limit to 1, but not set the lower limit + g.set(ylim=(None, 1)) if show_title: g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"}) @@ -645,7 +649,8 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): dodge=True, capsize=0.1, ) - + # set the y-axis upper limit to 1, but not set the lower limit + g.set(ylim=(None, 1)) if show_title: g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"}) # save the figure @@ -741,6 +746,8 @@ def plot_paradigm_clustering_score( capsize=0.1, ) g.axhline(0.0, color="r", linestyle="--") + # set the y-axis upper limit to 1, but not set the lower limit + g.set(ylim=(None, 1)) if show_title: g.set_title( "Task Paradigm Clustering Performance", @@ -776,7 +783,8 @@ def plot_paradigm_clustering_score( dodge=True, capsize=0.1, ) - + # set the y-axis upper limit to 1, but not set the lower limit + g.set(ylim=(None, 1)) if show_title: g.set_title( "Task Paradigm Clustering Performance", From 1504f86acd00d3c1dfe8493881bb7a6f2342e33a Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 29 Aug 2024 21:33:09 -0400 Subject: [PATCH 108/401] change generalized procruste --- .flake8 | 1 + pydfc/ml_utils.py | 73 +++++++++++++++++++++++++++++------------------ 2 files changed, 47 insertions(+), 27 deletions(-) diff --git a/.flake8 
b/.flake8 index 7f73516..b57c737 100644 --- a/.flake8 +++ b/.flake8 @@ -25,6 +25,7 @@ ignore = E731, E713, E714, + E722, E741, F403, F405, diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py index 5c17002..de8a37e 100644 --- a/pydfc/ml_utils.py +++ b/pydfc/ml_utils.py @@ -412,43 +412,61 @@ def generalized_procrustes(X_list): returns the mean X to be used as the reference for procrustes transformation """ - # initialize Procrustes distance - current_distance = 0 + for iter_num in range(100): - # initialize a mean X - mean_X = np.array(X_list[0]) + try: + # initialize Procrustes distance + current_distance = 0 - num_X = len(X_list) + num_X = len(X_list) - # create array for new Xs, add - new_Xs = np.zeros(np.array(X_list).shape) + # initialize a mean X by randomly selecting + # one of the Xs using np.random.choice + mean_X = X_list[np.random.choice(num_X)] - while True: - # add the mean X as first element of array - new_Xs[0] = mean_X + # create array for new Xs, add + new_Xs = np.zeros(np.array(X_list).shape) - # superimpose all shapes to current mean - for i in range(1, num_X): - _, new_X, _ = procrustes(mean_X, X_list[i]) - new_Xs[i] = new_X + counter = 0 + flag = False + while True: + counter += 1 + if counter > 1e6: + # if the algorithm does not converge, break the cycle + # to avoid infinite loop + flag = True + break + + # add the mean X as first element of array + new_Xs[0] = mean_X + + # superimpose all shapes to current mean + for i in range(1, num_X): + _, new_X, _ = procrustes(mean_X, X_list[i]) + new_Xs[i] = new_X - # calculate new mean - new_mean = np.mean(new_Xs, axis=0) + # calculate new mean + new_mean = np.mean(new_Xs, axis=0) - _, _, new_distance = procrustes(new_mean, mean_X) + _, _, new_distance = procrustes(new_mean, mean_X) + + # if the distance did not change, break the cycle + if np.abs(new_distance - current_distance) < 1e-6: + break - # if the distance did not change, break the cycle - if np.abs(new_distance - current_distance) < 1e-6: - 
break + # align the new_mean to old mean + _, new_mean, _ = procrustes(mean_X, new_mean) - # align the new_mean to old mean - _, new_mean, _ = procrustes(mean_X, new_mean) + # update mean and distance + mean_X = new_mean + current_distance = new_distance - # update mean and distance - mean_X = new_mean - current_distance = new_distance + if not flag: + return mean_X + except: + continue - return mean_X + raise ValueError("Generalized Procrustes Analysis did not converge.") def twonn(X, discard_ratio=0.1): @@ -477,7 +495,8 @@ def twonn(X, discard_ratio=0.1): mu = np.zeros((num_samples)) for i in range(num_samples): - # find the two nearest neighbors that have different distances and the distance is not 0 + # find the two nearest neighbors that have + # different distances and the distance is not 0 r1, r2 = None, None for j in range(distances.shape[1]): if distances[i, j] != 0: From 55cb8decc52ad02793dcb39df444ad7d91debb1e Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 10 Sep 2024 14:51:01 -0400 Subject: [PATCH 109/401] fix bugs --- pydfc/ml_utils.py | 129 +++++++++++++++++++++----------------------- pydfc/task_utils.py | 6 +++ task_dFC/ML.py | 75 +++++++++++++++++++++----- 3 files changed, 127 insertions(+), 83 deletions(-) diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py index de8a37e..857353f 100644 --- a/pydfc/ml_utils.py +++ b/pydfc/ml_utils.py @@ -128,81 +128,72 @@ def load_task_data(roi_root, subj, task, run=None, session=None): ################################# Feature Extraction Functions #################################### -def extract_task_features(TASKS, RUNS, SESSIONS, roi_root, dFC_root, output_root): +def extract_task_features(TASKS, RUNS, session, roi_root, dFC_root): """ Extract task features from the event data.""" - for session in SESSIONS: - task_features = { - "task": list(), - "run": list(), - "relative_task_on": list(), - "avg_task_duration": list(), - "var_task_duration": list(), - "avg_rest_duration": list(), - "var_rest_duration": 
list(), - "num_of_transitions": list(), - "relative_transition_freq": list(), - } - for task_id, task in enumerate(TASKS): - - if task == "task-restingstate": - continue - - for run in RUNS[task]: - - SUBJECTS = find_available_subjects( - dFC_root=dFC_root, task=task, run=run, session=session - ) + task_features = { + "task": list(), + "run": list(), + "relative_task_on": list(), + "avg_task_duration": list(), + "var_task_duration": list(), + "avg_rest_duration": list(), + "var_rest_duration": list(), + "num_of_transitions": list(), + "relative_transition_freq": list(), + } + for task_id, task in enumerate(TASKS): - for subj in SUBJECTS: - # event data - task_data = load_task_data( - roi_root=roi_root, subj=subj, task=task, run=run, session=session - ) - Fs_task = task_data["Fs_task"] - TR_task = 1 / Fs_task - - task_presence = extract_task_presence( - event_labels=task_data["event_labels"], - TR_task=TR_task, - TR_mri=task_data["TR_mri"], - binary=True, - binarizing_method="mean", - ) + if task == "task-restingstate": + continue - relative_task_on = calc_relative_task_on(task_presence) - # task duration - avg_task_duration, var_task_duration = calc_task_duration( - task_presence, task_data["TR_mri"] - ) - # rest duration - avg_rest_duration, var_rest_duration = calc_rest_duration( - task_presence, task_data["TR_mri"] - ) - # freq of transitions - num_of_transitions, relative_transition_freq = calc_transition_freq( - task_presence - ) + for run in RUNS[task]: - task_features["task"].append(task) - task_features["run"].append(run) - task_features["relative_task_on"].append(relative_task_on) - task_features["avg_task_duration"].append(avg_task_duration) - task_features["var_task_duration"].append(var_task_duration) - task_features["avg_rest_duration"].append(avg_rest_duration) - task_features["var_rest_duration"].append(var_rest_duration) - task_features["num_of_transitions"].append(num_of_transitions) - task_features["relative_transition_freq"].append( - 
relative_transition_freq - ) + SUBJECTS = find_available_subjects( + dFC_root=dFC_root, task=task, run=run, session=session + ) - if session is None: - folder = f"{output_root}" - else: - folder = f"{output_root}/{session}" - if not os.path.exists(folder): - os.makedirs(folder) - np.save(f"{folder}/task_features.npy", task_features) + for subj in SUBJECTS: + # event data + task_data = load_task_data( + roi_root=roi_root, subj=subj, task=task, run=run, session=session + ) + Fs_task = task_data["Fs_task"] + TR_task = 1 / Fs_task + + task_presence = extract_task_presence( + event_labels=task_data["event_labels"], + TR_task=TR_task, + TR_mri=task_data["TR_mri"], + binary=True, + binarizing_method="mean", + ) + + relative_task_on = calc_relative_task_on(task_presence) + # task duration + avg_task_duration, var_task_duration = calc_task_duration( + task_presence, task_data["TR_mri"] + ) + # rest duration + avg_rest_duration, var_rest_duration = calc_rest_duration( + task_presence, task_data["TR_mri"] + ) + # freq of transitions + num_of_transitions, relative_transition_freq = calc_transition_freq( + task_presence + ) + + task_features["task"].append(task) + task_features["run"].append(run) + task_features["relative_task_on"].append(relative_task_on) + task_features["avg_task_duration"].append(avg_task_duration) + task_features["var_task_duration"].append(var_task_duration) + task_features["avg_rest_duration"].append(avg_rest_duration) + task_features["var_rest_duration"].append(var_rest_duration) + task_features["num_of_transitions"].append(num_of_transitions) + task_features["relative_transition_freq"].append(relative_transition_freq) + + return task_features def dFC_feature_extraction_subj_lvl( diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py index 8a11cbf..4fa8f0d 100644 --- a/pydfc/task_utils.py +++ b/pydfc/task_utils.py @@ -336,6 +336,9 @@ def calc_task_duration(task_presence, TR_mri): task_durations.append((end - start) * TR_mri) start = None task_durations = 
np.array(task_durations) + # find mean and variance of task durations with division error handling + if len(task_durations) == 0: + return 0, 0 return np.mean(task_durations), np.var(task_durations) @@ -358,6 +361,9 @@ def calc_rest_duration(task_presence, TR_mri): end = len(task_presence) rest_durations.append((end - start) * TR_mri) rest_durations = np.array(rest_durations) + # find mean and variance of rest durations with division error handling + if len(rest_durations) == 0: + return 0, 0 return np.mean(rest_durations), np.var(rest_durations) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index d44e449..a792130 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -15,6 +15,39 @@ ####################################################################################### +def run_task_features_extraction( + TASKS, + RUNS, + SESSIONS, + roi_root, + dFC_root, + output_root, +): + for session in SESSIONS: + task_features = extract_task_features( + TASKS=TASKS, + RUNS=RUNS, + session=session, + roi_root=roi_root, + dFC_root=dFC_root, + ) + + if session is None: + folder = f"{output_root}" + else: + folder = f"{output_root}/{session}" + try: + if not os.path.exists(folder): + os.makedirs(folder) + except OSError as err: + print(err) + try: + if not os.path.exists(f"{folder}/task_features.npy"): + np.save(f"{folder}/task_features.npy", task_features) + except OSError as err: + print(err) + + def run_classification( dFC_id, TASKS, @@ -66,8 +99,11 @@ def run_classification( folder = f"{output_root}" else: folder = f"{output_root}/{session}" - if not os.path.exists(folder): - os.makedirs(folder) + try: + if not os.path.exists(folder): + os.makedirs(folder) + except OSError as err: + print(err) np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT) np.save(f"{folder}/ML_scores_classify_{dFC_id}.npy", ML_scores) @@ -119,8 +155,11 @@ def run_clustering( folder = f"{output_root}" else: folder = f"{output_root}/{session}" - if not os.path.exists(folder): - os.makedirs(folder) + try: 
+ if not os.path.exists(folder): + os.makedirs(folder) + except OSError as err: + print(err) np.save(f"{folder}/clustering_RESULTS_{dFC_id}.npy", clustering_RESULTS) np.save(f"{folder}/clustering_scores_{dFC_id}.npy", clustering_scores) @@ -152,8 +191,11 @@ def run_task_paradigm_clustering( folder = f"{output_root}" else: folder = f"{output_root}/{session}" - if not os.path.exists(folder): - os.makedirs(folder) + try: + if not os.path.exists(folder): + os.makedirs(folder) + except OSError as err: + print(err) np.save( f"{folder}/task_paradigm_clstr_RESULTS_{dFC_id}.npy", @@ -220,14 +262,19 @@ def run_task_paradigm_clustering( else: ML_root = dataset_info["ML_root"] - extract_task_features( - TASKS=TASKS, - RUNS=RUNS, - SESSIONS=SESSIONS, - roi_root=roi_root, - dFC_root=dFC_root, - output_root=ML_root, - ) + # The task feature extraction will be executed multiple times in parallel redundantly + try: + run_task_features_extraction( + TASKS=TASKS, + RUNS=RUNS, + SESSIONS=SESSIONS, + roi_root=roi_root, + dFC_root=dFC_root, + output_root=ML_root, + ) + except Exception as e: + print(f"Error in task features extraction: {e}") + traceback.print_exc() print("Task features extraction finished.") job_id = int(os.getenv("SGE_TASK_ID")) From b5be00bd455e3acac2ce88172b6988134baaa5b7 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 10 Sep 2024 15:06:59 -0400 Subject: [PATCH 110/401] handle common bold.json --- task_dFC/nifti_to_roi_signal.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index 3953865..56880e4 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -65,6 +65,26 @@ def run_roi_signal_extraction( nifti_file = f"{fmriprep_root}/{subj}/{session}/func/{task_file}" task_events_root = f"{main_root}/bids/{subj}/{session}/func" info_file = f"{task_events_root}/{task_file.replace(bold_suffix, '_bold.json')}" + + # in some cases the info file is 
common for all subjects and can be found in f"{main_root}/bids" + if not os.path.exists(info_file): + ALL_COMMON_FILES = os.listdir(f"{main_root}/bids/") + ALL_COMMON_FILES = [ + file_i + for file_i in ALL_COMMON_FILES + if (f"{task}_" in file_i) and ("_bold.json" in file_i) + ] + if len(ALL_COMMON_FILES) == 1: + info_file = f"{main_root}/bids/{ALL_COMMON_FILES[0]}" + if not os.path.exists(info_file): + # if the info file is not found, exclude the subject + if run is None: + print(f"bold.json info file not found for {subj} {session_str} {task}") + else: + print( + f"bold.json info file not found for {subj} {session_str} {task} {run}" + ) + return ################################# LOAD JSON INFO ######################### # Opening JSON file as a dictionary f = open(info_file) From 4a6405bd5d1429bbf42bab19781fffac5465e537 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 12 Sep 2024 11:50:51 -0400 Subject: [PATCH 111/401] add slurm run scripts --- pydfc/ml_utils.py | 1 + .../dataset_info.json | 0 .../global_configs.json | 0 .../methods_config.json | 0 .../run_FCS.sh | 0 .../run_ML.sh | 0 .../run_dFC.sh | 0 .../run_fmriprep.sh | 0 .../run_nifti_to_roi.sh | 0 .../run_report.sh | 0 task_dFC/run_scripts_slurm/dataset_info.json | 22 ++++++++ .../run_scripts_slurm/global_configs.json | 54 +++++++++++++++++++ .../run_scripts_slurm/methods_config.json | 35 ++++++++++++ task_dFC/run_scripts_slurm/run_FCS.sh | 18 +++++++ task_dFC/run_scripts_slurm/run_ML.sh | 16 ++++++ task_dFC/run_scripts_slurm/run_dFC.sh | 23 ++++++++ task_dFC/run_scripts_slurm/run_fmriprep.sh | 24 +++++++++ .../run_scripts_slurm/run_nifti_to_roi.sh | 23 ++++++++ task_dFC/run_scripts_slurm/run_report.sh | 18 +++++++ 19 files changed, 234 insertions(+) rename task_dFC/{run_scripts => run_scripts_sge}/dataset_info.json (100%) rename task_dFC/{run_scripts => run_scripts_sge}/global_configs.json (100%) rename task_dFC/{run_scripts => run_scripts_sge}/methods_config.json (100%) rename task_dFC/{run_scripts => 
run_scripts_sge}/run_FCS.sh (100%) rename task_dFC/{run_scripts => run_scripts_sge}/run_ML.sh (100%) rename task_dFC/{run_scripts => run_scripts_sge}/run_dFC.sh (100%) rename task_dFC/{run_scripts => run_scripts_sge}/run_fmriprep.sh (100%) rename task_dFC/{run_scripts => run_scripts_sge}/run_nifti_to_roi.sh (100%) rename task_dFC/{run_scripts => run_scripts_sge}/run_report.sh (100%) create mode 100644 task_dFC/run_scripts_slurm/dataset_info.json create mode 100644 task_dFC/run_scripts_slurm/global_configs.json create mode 100644 task_dFC/run_scripts_slurm/methods_config.json create mode 100644 task_dFC/run_scripts_slurm/run_FCS.sh create mode 100644 task_dFC/run_scripts_slurm/run_ML.sh create mode 100644 task_dFC/run_scripts_slurm/run_dFC.sh create mode 100644 task_dFC/run_scripts_slurm/run_fmriprep.sh create mode 100644 task_dFC/run_scripts_slurm/run_nifti_to_roi.sh create mode 100644 task_dFC/run_scripts_slurm/run_report.sh diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py index 857353f..693e881 100644 --- a/pydfc/ml_utils.py +++ b/pydfc/ml_utils.py @@ -617,6 +617,7 @@ def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"): n_components=n_components, affinity="precomputed", n_neighbors=n_neighbors_to_be_used, + # eigen_solver="lobpcg", ) X_embed = LE.fit_transform(X=affinity_matrix) return X_embed diff --git a/task_dFC/run_scripts/dataset_info.json b/task_dFC/run_scripts_sge/dataset_info.json similarity index 100% rename from task_dFC/run_scripts/dataset_info.json rename to task_dFC/run_scripts_sge/dataset_info.json diff --git a/task_dFC/run_scripts/global_configs.json b/task_dFC/run_scripts_sge/global_configs.json similarity index 100% rename from task_dFC/run_scripts/global_configs.json rename to task_dFC/run_scripts_sge/global_configs.json diff --git a/task_dFC/run_scripts/methods_config.json b/task_dFC/run_scripts_sge/methods_config.json similarity index 100% rename from task_dFC/run_scripts/methods_config.json rename to 
task_dFC/run_scripts_sge/methods_config.json diff --git a/task_dFC/run_scripts/run_FCS.sh b/task_dFC/run_scripts_sge/run_FCS.sh similarity index 100% rename from task_dFC/run_scripts/run_FCS.sh rename to task_dFC/run_scripts_sge/run_FCS.sh diff --git a/task_dFC/run_scripts/run_ML.sh b/task_dFC/run_scripts_sge/run_ML.sh similarity index 100% rename from task_dFC/run_scripts/run_ML.sh rename to task_dFC/run_scripts_sge/run_ML.sh diff --git a/task_dFC/run_scripts/run_dFC.sh b/task_dFC/run_scripts_sge/run_dFC.sh similarity index 100% rename from task_dFC/run_scripts/run_dFC.sh rename to task_dFC/run_scripts_sge/run_dFC.sh diff --git a/task_dFC/run_scripts/run_fmriprep.sh b/task_dFC/run_scripts_sge/run_fmriprep.sh similarity index 100% rename from task_dFC/run_scripts/run_fmriprep.sh rename to task_dFC/run_scripts_sge/run_fmriprep.sh diff --git a/task_dFC/run_scripts/run_nifti_to_roi.sh b/task_dFC/run_scripts_sge/run_nifti_to_roi.sh similarity index 100% rename from task_dFC/run_scripts/run_nifti_to_roi.sh rename to task_dFC/run_scripts_sge/run_nifti_to_roi.sh diff --git a/task_dFC/run_scripts/run_report.sh b/task_dFC/run_scripts_sge/run_report.sh similarity index 100% rename from task_dFC/run_scripts/run_report.sh rename to task_dFC/run_scripts_sge/run_report.sh diff --git a/task_dFC/run_scripts_slurm/dataset_info.json b/task_dFC/run_scripts_slurm/dataset_info.json new file mode 100644 index 0000000..16d775e --- /dev/null +++ b/task_dFC/run_scripts_slurm/dataset_info.json @@ -0,0 +1,22 @@ +{ + "dataset" : "", + "main_root" : "/data/origami/dFC/DATA/task-based/openneuro/{dataset}", + "fmriprep_root" : "{main_root}/derivatives/fmriprep/23.1.3/output", + "roi_root" : "{main_root}/derivatives/ROI_timeseries", + "fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES", + "dFC_root" : "{main_root}/derivatives/dFC_assessed", + "ML_root" : "{main_root}/derivatives/ML", + "reports_root" : "{main_root}/derivatives/reports", + "trial_type_label" : "trial_type", + 
"rest_labels" : ["rest", "Rest"], + "bold_suffix" : "_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz", + "SESSIONS" : [ + "ses-1" + ], + "TASKS" : [ + "task-A" + ], + "RUNS" : { + "task-A": ["run-01", "run-02", "run-03", "run-04", "run-05", "run-06"] + } +} diff --git a/task_dFC/run_scripts_slurm/global_configs.json b/task_dFC/run_scripts_slurm/global_configs.json new file mode 100644 index 0000000..44a524c --- /dev/null +++ b/task_dFC/run_scripts_slurm/global_configs.json @@ -0,0 +1,54 @@ +{ + "DATASET_NAME": "", + "DATASET_ROOT": "/home/mt00/scratch/DATA/task-based/openneuro//", + + "CONTAINER_STORE": "/home/mt00/projects/def-jbpoline/container_store/nipoppy/", + + "SINGULARITY_PATH": "singularity", + + "TEMPLATEFLOW_DIR": "/home/mt00/projects/def-jbpoline/templateflow", + + "SESSIONS": [], + "VISITS": [], + + "BIDS": { + "heudiconv": { + "VERSION": "0.11.6", + "CONTAINER": "heudiconv_{}.sif", + "URL": "" + }, + "validator":{ + "CONTAINER": "bids_validator.sif", + "URL": "" + + } + }, + + "PROC_PIPELINES": { + "mriqc": { + "VERSION": "23.1.0", + "CONTAINER": "mriqc_{}.sif", + "URL": "" + }, + "fmriprep": { + "VERSION": "23.1.3", + "CONTAINER": "fmriprep_{}.sif", + "URL": "" + }, + "freesurfer": { + "VERSION": "7.3.2", + "CONTAINER": "fmriprep_{}.sif", + "URL": "" + } + }, + + "TABULAR": { + "data_dictionary": { + "PATH": "", + "VERSION": "", + "URL": "" + } + }, + + "WORKFLOWS": [] +} diff --git a/task_dFC/run_scripts_slurm/methods_config.json b/task_dFC/run_scripts_slurm/methods_config.json new file mode 100644 index 0000000..d4013d4 --- /dev/null +++ b/task_dFC/run_scripts_slurm/methods_config.json @@ -0,0 +1,35 @@ +{ + "params_methods" : { + "W": 12, + "n_overlap": 1.0, + "sw_method": "pear_corr", + "tapered_window": true, + "TF_method": "WTC", + "clstr_base_measure": "SlidingWindow", + "hmm_iter": 20, + "dhmm_obs_state_ratio": 0.666, + "n_states": 5, + "n_subj_clstrs": 10, + "n_jobs": 2, + "verbose": 0, + "backend": "loky", + "normalization": true, 
+ "num_subj": null, + "num_time_point": null + }, + "MEASURES_name_lst" : [ + "SlidingWindow", + "Time-Freq", + "CAP", + "ContinuousHMM", + "Windowless", + "Clustering", + "DiscreteHMM" + ], + "alter_hparams" : [], + "params_multi_analysis" : { + "n_jobs": null, + "verbose": 0, + "backend": "loky" + } +} diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh new file mode 100644 index 0000000..a84c578 --- /dev/null +++ b/task_dFC/run_scripts_slurm/run_FCS.sh @@ -0,0 +1,18 @@ +#!/bin/sh +# +#$ -cwd +#$ -o logs/fcs_out.txt +#$ -e logs/fcs_err.txt +#$ -l h_vmem=64G +#$ -q origami.q + +DATASET_INFO="./dataset_info.json" +METHODS_CONFIG="./methods_config.json" + +source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh +conda activate pydfc +python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/FCS_estimate.py" \ +--dataset_info $DATASET_INFO \ +--methods_config $METHODS_CONFIG + +conda deactivate diff --git a/task_dFC/run_scripts_slurm/run_ML.sh b/task_dFC/run_scripts_slurm/run_ML.sh new file mode 100644 index 0000000..4ec431a --- /dev/null +++ b/task_dFC/run_scripts_slurm/run_ML.sh @@ -0,0 +1,16 @@ +#!/bin/sh +# +#$ -cwd +#$ -o logs/ML_out.txt +#$ -e logs/ML_err.txt +#$ -l h_vmem=64G +#$ -q origami.q + +DATASET_INFO="./dataset_info.json" + +source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh +conda activate pydfc +python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/ML.py" \ +--dataset_info $DATASET_INFO + +conda deactivate diff --git a/task_dFC/run_scripts_slurm/run_dFC.sh b/task_dFC/run_scripts_slurm/run_dFC.sh new file mode 100644 index 0000000..124dc1f --- /dev/null +++ b/task_dFC/run_scripts_slurm/run_dFC.sh @@ -0,0 +1,23 @@ +#!/bin/sh +# +#$ -cwd +#$ -o logs/dfc_out.txt +#$ -e logs/dfc_err.txt +#$ -l h_vmem=32G +#$ -q origami.q + +SUBJECT_LIST="./subj_list.txt" +DATASET_INFO="./dataset_info.json" + +echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`" + +SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST` +echo "Subject 
ID: $SUBJECT_ID" + +source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh +conda activate pydfc +python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/dFC_assessment.py" \ +--dataset_info $DATASET_INFO \ +--participant_id $SUBJECT_ID + +conda deactivate diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh new file mode 100644 index 0000000..7197245 --- /dev/null +++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# +#SBATCH --job-name=fmriprep_job # Name of the job +#SBATCH --output=logs/fmriprep_out.log # Standard output log +#SBATCH --error=logs/fmriprep_err.log # Standard error log +#SBATCH --time=24:00:00 # Walltime (24 hours) +#SBATCH --mem=32G # Memory (32 GB) +#SBATCH --cpus-per-task=1 # Number of CPU cores per task +#SBATCH --account=rrg-jbpoline # Account + +source "/home/mt00/projects/rrg-jbpoline/mt00/venvs/nipoppy_env/bin/activate" + +SUBJECT_LIST="./subj_list.txt" + +echo "Number subjects found: $(wc -l < $SUBJECT_LIST)" + +SUBJECT_ID=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST) +echo "Subject ID: $SUBJECT_ID" + +nipoppy run \ +-pipeline fmriprep \ +--participant_id $SUBJECT_ID + +deactivate diff --git a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh new file mode 100644 index 0000000..1fff1da --- /dev/null +++ b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh @@ -0,0 +1,23 @@ +#!/bin/sh +# +#$ -cwd +#$ -o logs/roi_out.txt +#$ -e logs/roi_err.txt +#$ -l h_vmem=32G +#$ -q origami.q + +SUBJECT_LIST="./subj_list.txt" +DATASET_INFO="./dataset_info.json" + +echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`" + +SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST` +echo "Subject ID: $SUBJECT_ID" + +source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh +conda activate pydfc +python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/nifti_to_roi_signal.py" \ +--dataset_info $DATASET_INFO \ +--participant_id $SUBJECT_ID + +conda 
deactivate diff --git a/task_dFC/run_scripts_slurm/run_report.sh b/task_dFC/run_scripts_slurm/run_report.sh new file mode 100644 index 0000000..2a00cc5 --- /dev/null +++ b/task_dFC/run_scripts_slurm/run_report.sh @@ -0,0 +1,18 @@ +#!/bin/sh +# +#$ -cwd +#$ -o logs/report_out.txt +#$ -e logs/report_err.txt +#$ -l h_vmem=16G +#$ -q origami.q + +DATASET_INFO="./dataset_info.json" +SUBJ_LIST="./subj_list.txt" + +source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh +conda activate pydfc +python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/generate_report.py" \ +--dataset_info $DATASET_INFO \ +--subj_list $SUBJ_LIST + +conda deactivate From 4d2aea2fa6abf0ceda2fad60d19f9ce42ed8fe9e Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 12 Sep 2024 13:01:10 -0400 Subject: [PATCH 112/401] new global config --- .../run_scripts_slurm/global_configs.json | 197 ++++++++++++++---- 1 file changed, 151 insertions(+), 46 deletions(-) diff --git a/task_dFC/run_scripts_slurm/global_configs.json b/task_dFC/run_scripts_slurm/global_configs.json index 44a524c..252968f 100644 --- a/task_dFC/run_scripts_slurm/global_configs.json +++ b/task_dFC/run_scripts_slurm/global_configs.json @@ -1,54 +1,159 @@ { - "DATASET_NAME": "", - "DATASET_ROOT": "/home/mt00/scratch/DATA/task-based/openneuro//", - - "CONTAINER_STORE": "/home/mt00/projects/def-jbpoline/container_store/nipoppy/", - - "SINGULARITY_PATH": "singularity", - - "TEMPLATEFLOW_DIR": "/home/mt00/projects/def-jbpoline/templateflow", - - "SESSIONS": [], - "VISITS": [], - - "BIDS": { - "heudiconv": { - "VERSION": "0.11.6", - "CONTAINER": "heudiconv_{}.sif", - "URL": "" - }, - "validator":{ - "CONTAINER": "bids_validator.sif", - "URL": "" - - } + "DATASET_NAME": "", + "VISIT_IDS": [], + "SESSION_IDS": [], + "SUBSTITUTIONS": { + "[[NIPOPPY_DPATH_CONTAINERS]]": "/home/mt00/projects/def-jbpoline/container_store/nipoppy/", + "[[HEUDICONV_HEURISTIC_FILE]]": "", + "[[DCM2BIDS_CONFIG_FILE]]": "", + "[[FREESURFER_LICENSE_FILE]]": 
"/home/mt00/projects/rrg-jbpoline/mt00/freesurfer/", + "[[TEMPLATEFLOW_HOME]]": "/home/mt00/projects/def-jbpoline/templateflow" }, - - "PROC_PIPELINES": { - "mriqc": { - "VERSION": "23.1.0", - "CONTAINER": "mriqc_{}.sif", - "URL": "" + "DICOM_DIR_PARTICIPANT_FIRST": true, + "CONTAINER_CONFIG": { + "COMMAND": "apptainer", + "ARGS": [ + "--cleanenv" + ] + }, + "BIDS_PIPELINES": [ + { + "NAME": "heudiconv", + "VERSION": "0.12.2", + "CONTAINER_INFO": { + "FILE": "[[NIPOPPY_DPATH_CONTAINERS]]/[[PIPELINE_NAME]]_[[PIPELINE_VERSION]].sif", + "URI": "docker://nipy/[[PIPELINE_NAME]]:[[PIPELINE_VERSION]]" + }, + "STEPS": [ + { + "NAME": "prepare", + "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json", + "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json" + }, + { + "NAME": "convert", + "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json", + "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json", + "CONTAINER_CONFIG": { + "ARGS": [ + "--bind", + "[[HEUDICONV_HEURISTIC_FILE]]" + ] + }, + "UPDATE_DOUGHNUT": true + } + ] + }, + { + "NAME": "dcm2bids", + "VERSION": "3.1.0", + "CONTAINER_INFO": { + "FILE": "[[NIPOPPY_DPATH_CONTAINERS]]/[[PIPELINE_NAME]]_[[PIPELINE_VERSION]].sif", + "URI": "docker://unfmontreal/[[PIPELINE_NAME]]:[[PIPELINE_VERSION]]" + }, + "STEPS": [ + { + "NAME": "prepare", + "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json", + "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/dcm2bids_helper-[[PIPELINE_VERSION]].json" + }, + { + "NAME": "convert", + "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json", + "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/dcm2bids-[[PIPELINE_VERSION]].json", + "CONTAINER_CONFIG": { + "ARGS": [ + "--bind", + 
"[[DCM2BIDS_CONFIG_FILE]]" + ] + }, + "UPDATE_DOUGHNUT": true + } + ] }, - "fmriprep": { + { + "NAME": "bidscoin", + "VERSION": "4.3.2", + "STEPS": [ + { + "NAME": "prepare", + "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json", + "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/bidsmapper-[[PIPELINE_VERSION]].json", + "ANALYSIS_LEVEL": "group" + }, + { + "NAME": "edit", + "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json", + "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/bidseditor-[[PIPELINE_VERSION]].json", + "ANALYSIS_LEVEL": "group" + }, + { + "NAME": "convert", + "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json", + "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/bidscoiner-[[PIPELINE_VERSION]].json", + "ANALYSIS_LEVEL": "participant", + "UPDATE_DOUGHNUT": true + } + ] + } + ], + "PROC_PIPELINES": [ + { + "NAME": "fmriprep", "VERSION": "23.1.3", - "CONTAINER": "fmriprep_{}.sif", - "URL": "" + "CONTAINER_INFO": { + "FILE": "[[NIPOPPY_DPATH_CONTAINERS]]/[[PIPELINE_NAME]]_[[PIPELINE_VERSION]].sif", + "URI": "docker://nipreps/[[PIPELINE_NAME]]:[[PIPELINE_VERSION]]" + }, + "CONTAINER_CONFIG": { + "ENV_VARS": { + "TEMPLATEFLOW_HOME": "[[TEMPLATEFLOW_HOME]]" + }, + "ARGS": [ + "--bind", + "[[FREESURFER_LICENSE_FILE]]", + "--bind", + "[[TEMPLATEFLOW_HOME]]" + ] + }, + "STEPS": [ + { + "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json", + "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json" + } + ], + "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_TRACKER_CONFIGS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json" }, - "freesurfer": { + { + "NAME": "freesurfer", "VERSION": "7.3.2", - "CONTAINER": "fmriprep_{}.sif", - "URL": "" - } - }, - - "TABULAR": { - "data_dictionary": { - "PATH": "", - "VERSION": "", - "URL": "" + 
"DESCRIPTION": "Freesurfer version associated with fMRIPrep 23.1.3", + "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_TRACKER_CONFIGS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json" + }, + { + "NAME": "mriqc", + "VERSION": "23.1.0", + "CONTAINER_INFO": { + "FILE": "[[NIPOPPY_DPATH_CONTAINERS]]/[[PIPELINE_NAME]]_[[PIPELINE_VERSION]].sif", + "URI": "docker://nipreps/[[PIPELINE_NAME]]:[[PIPELINE_VERSION]]" + }, + "CONTAINER_CONFIG": { + "ENV_VARS": { + "TEMPLATEFLOW_HOME": "[[TEMPLATEFLOW_HOME]]" + }, + "ARGS": [ + "--bind", + "[[TEMPLATEFLOW_HOME]]" + ] + }, + "STEPS": [ + { + "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json", + "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json" + } + ], + "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_TRACKER_CONFIGS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json" } - }, - - "WORKFLOWS": [] + ], + "CUSTOM": {} } From a8a36ced23817184859ec88e04f8d00de85dfc30 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 12 Sep 2024 13:07:22 -0400 Subject: [PATCH 113/401] minor change --- .../run_scripts_slurm/{global_configs.json => global_config.json} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename task_dFC/run_scripts_slurm/{global_configs.json => global_config.json} (100%) diff --git a/task_dFC/run_scripts_slurm/global_configs.json b/task_dFC/run_scripts_slurm/global_config.json similarity index 100% rename from task_dFC/run_scripts_slurm/global_configs.json rename to task_dFC/run_scripts_slurm/global_config.json From aa6b724eab15221d63516ffe171982e5f536b90a Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 12 Sep 2024 13:36:18 -0400 Subject: [PATCH 114/401] minor change --- task_dFC/run_scripts_slurm/run_fmriprep.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh index 7197245..3f2aa99 100644 --- a/task_dFC/run_scripts_slurm/run_fmriprep.sh +++ 
b/task_dFC/run_scripts_slurm/run_fmriprep.sh @@ -18,7 +18,8 @@ SUBJECT_ID=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST) echo "Subject ID: $SUBJECT_ID" nipoppy run \ --pipeline fmriprep \ +--pipeline fmriprep \ +--dataset-root "$(dirname "$(pwd)")" \ --participant_id $SUBJECT_ID deactivate From 43686e3b8b99f9a8c06626b1065432de6c29b30c Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 12 Sep 2024 13:44:14 -0400 Subject: [PATCH 115/401] minor fix --- task_dFC/run_scripts_slurm/run_fmriprep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh index 3f2aa99..1e90631 100644 --- a/task_dFC/run_scripts_slurm/run_fmriprep.sh +++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh @@ -20,6 +20,6 @@ echo "Subject ID: $SUBJECT_ID" nipoppy run \ --pipeline fmriprep \ --dataset-root "$(dirname "$(pwd)")" \ ---participant_id $SUBJECT_ID +--participant-id $SUBJECT_ID deactivate From 5fe74669c4513070883c11ff9cf4ec8485069b93 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 19 Sep 2024 12:42:52 -0400 Subject: [PATCH 116/401] add simul to slurm --- .../run_simulator.sh | 0 simul_dFC/run_scripts_slurm/run_simulator.sh | 17 +++++++++++++++++ 2 files changed, 17 insertions(+) rename simul_dFC/{run_scripts => run_scripts_sge}/run_simulator.sh (100%) create mode 100644 simul_dFC/run_scripts_slurm/run_simulator.sh diff --git a/simul_dFC/run_scripts/run_simulator.sh b/simul_dFC/run_scripts_sge/run_simulator.sh similarity index 100% rename from simul_dFC/run_scripts/run_simulator.sh rename to simul_dFC/run_scripts_sge/run_simulator.sh diff --git a/simul_dFC/run_scripts_slurm/run_simulator.sh b/simul_dFC/run_scripts_slurm/run_simulator.sh new file mode 100644 index 0000000..b363a7f --- /dev/null +++ b/simul_dFC/run_scripts_slurm/run_simulator.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# +#SBATCH --job-name=simul_dfc_job # Optional: Name of your job +#SBATCH --output=logs/simul_out.txt # Standard 
output log +#SBATCH --error=logs/simul_err.txt # Standard error log +#SBATCH --account=rrg-jbpoline # Account +#SBATCH --mem=8G # Memory request per node +#SBATCH --array=1-200 # Task array specification + +# Activate virtual environment +source "/home/mt00/projects/rrg-jbpoline/mt00/venvs/pydfc/bin/activate" + +# Run Python script +python "/home/mt00/projects/rrg-jbpoline/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py" + +# Deactivate environment +deactivate From 3e0ae70237864a20c7a49ce94328b4e2dc936def Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 19 Sep 2024 17:41:27 -0400 Subject: [PATCH 117/401] add PCA to embedding --- pydfc/ml_utils.py | 365 +++++++++++++++++++----------------- task_dFC/ML.py | 2 + task_dFC/generate_report.py | 309 +++++++++++++++--------------- 3 files changed, 352 insertions(+), 324 deletions(-) diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py index 693e881..e7cce64 100644 --- a/pydfc/ml_utils.py +++ b/pydfc/ml_utils.py @@ -819,6 +819,11 @@ def embed_dFC_features( LE_embedding_method: "concat+embed" or "embed+procrustes" """ + # make a copy of the data + X_train = X_train.copy() + if X_test is not None: + X_test = X_test.copy() + if embedding == "PCA": # if n_components is not specified, use 95% of the variance if n_components == "auto": @@ -1077,54 +1082,7 @@ def task_presence_classification( ) ) - # embed dFC features - X_train, X_test = embed_dFC_features( - train_subjects=train_subjects, - test_subjects=test_subjects, - X_train=X_train, - X_test=X_test, - y_train=y_train, - y_test=y_test, - subj_label_train=subj_label_train, - subj_label_test=subj_label_test, - embedding="LE", - n_components="auto", - n_neighbors_LE=125, - LE_embedding_method="embed+procrustes", - ) - - # task presence classification - - print("task presence classification ...") - - # logistic regression - log_reg_RESULT = logistic_regression_classify(X_train, y_train, X_test, y_test) - - # KNN - KNN_RESULT = KNN_classify(X_train, y_train, X_test, y_test) - - 
# # Random Forest - # RF_RESULT = random_forest_classify( - # X_train, y_train, X_test, y_test - # ) - - # # Gradient Boosting - # GBT_RESULT = gradient_boosting_classify( - # X_train, y_train, X_test, y_test - # ) - ML_RESULT = {} - for key in log_reg_RESULT: - ML_RESULT[key] = log_reg_RESULT[key] - for key in KNN_RESULT: - ML_RESULT[key] = KNN_RESULT[key] - # for key in RF_RESULT: - # ML_RESULT[key] = RF_RESULT[key] - # for key in GBT_RESULT: - # ML_RESULT[key] = GBT_RESULT[key] - - # measure pred score on each subj - ML_scores = { "subj_id": list(), "group": list(), @@ -1135,42 +1093,94 @@ def task_presence_classification( "KNN accuracy": list(), # "Random Forest accuracy": list(), # "Gradient Boosting accuracy": list(), + "embedding": list(), } - log_reg = log_reg_RESULT["log_reg_model"] - KNN = KNN_RESULT["KNN_model"] - # RF = RF_RESULT["RF_model"] - # GBT = GBT_RESULT["GB_model"] - - for subj in SUBJECTS: - ML_scores["subj_id"].append(subj) - if subj in train_subjects: - ML_scores["group"].append("train") - features = X_train[subj_label_train == subj, :] - target = y_train[subj_label_train == subj] - elif subj in test_subjects: - ML_scores["group"].append("test") - features = X_test[subj_label_test == subj, :] - target = y_test[subj_label_test == subj] - - pred_lr = log_reg.predict(features) - pred_KNN = KNN.predict(features) - # pred_RF = RF.predict(features) - # pred_GBT = GBT.predict(features) - - ML_scores["Logistic regression accuracy"].append( - balanced_accuracy_score(target, pred_lr) + for embedding in ["PCA", "LE"]: + # embed dFC features + X_train_embedded, X_test_embedded = embed_dFC_features( + train_subjects=train_subjects, + test_subjects=test_subjects, + X_train=X_train, + X_test=X_test, + y_train=y_train, + y_test=y_test, + subj_label_train=subj_label_train, + subj_label_test=subj_label_test, + embedding=embedding, + n_components="auto", + n_neighbors_LE=125, + LE_embedding_method="embed+procrustes", ) - ML_scores["KNN 
accuracy"].append(balanced_accuracy_score(target, pred_KNN)) - # ML_scores["Random Forest accuracy"].append( - # balanced_accuracy_score(target, pred_RF) + + # task presence classification + + print("task presence classification ...") + + # logistic regression + log_reg_RESULT = logistic_regression_classify( + X_train_embedded, y_train, X_test_embedded, y_test + ) + + # KNN + KNN_RESULT = KNN_classify(X_train_embedded, y_train, X_test_embedded, y_test) + + # # Random Forest + # RF_RESULT = random_forest_classify( + # X_train_embedded, y_train, X_test_embedded, y_test # ) - # ML_scores["Gradient Boosting accuracy"].append( - # balanced_accuracy_score(target, pred_GBT) + + # # Gradient Boosting + # GBT_RESULT = gradient_boosting_classify( + # X_train_embedded, y_train, X_test_embedded, y_test # ) - ML_scores["task"].append(task) - ML_scores["run"].append(run) - ML_scores["dFC method"].append(measure_name) + ML_RESULT[embedding] = {} + for key in log_reg_RESULT: + ML_RESULT[embedding][key] = log_reg_RESULT[key] + for key in KNN_RESULT: + ML_RESULT[embedding][key] = KNN_RESULT[key] + # for key in RF_RESULT: + # ML_RESULT[embedding][key] = RF_RESULT[key] + # for key in GBT_RESULT: + # ML_RESULT[embedding][key] = GBT_RESULT[key] + + # measure pred score on each subj + log_reg = log_reg_RESULT["log_reg_model"] + KNN = KNN_RESULT["KNN_model"] + # RF = RF_RESULT["RF_model"] + # GBT = GBT_RESULT["GB_model"] + + for subj in SUBJECTS: + ML_scores["subj_id"].append(subj) + if subj in train_subjects: + ML_scores["group"].append("train") + features = X_train_embedded[subj_label_train == subj, :] + target = y_train[subj_label_train == subj] + elif subj in test_subjects: + ML_scores["group"].append("test") + features = X_test_embedded[subj_label_test == subj, :] + target = y_test[subj_label_test == subj] + + pred_lr = log_reg.predict(features) + pred_KNN = KNN.predict(features) + # pred_RF = RF.predict(features) + # pred_GBT = GBT.predict(features) + + ML_scores["Logistic 
regression accuracy"].append( + balanced_accuracy_score(target, pred_lr) + ) + ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN)) + # ML_scores["Random Forest accuracy"].append( + # balanced_accuracy_score(target, pred_RF) + # ) + # ML_scores["Gradient Boosting accuracy"].append( + # balanced_accuracy_score(target, pred_GBT) + # ) + + ML_scores["task"].append(task) + ML_scores["run"].append(run) + ML_scores["dFC method"].append(measure_name) + ML_scores["embedding"].append(embedding) return ML_RESULT, ML_scores @@ -1214,49 +1224,7 @@ def task_presence_clustering( normalize_dFC=normalize_dFC, ) - # embed dFC features - X, _ = embed_dFC_features( - train_subjects=SUBJECTS, - test_subjects=[], - X_train=X, - X_test=None, - y_train=y, - y_test=None, - subj_label_train=subj_label, - subj_label_test=None, - embedding="LE", - n_components="auto", - n_neighbors_LE=125, - LE_embedding_method="embed+procrustes", - ) - - # clustering - # apply kmeans clustering to dFC features - - n_clusters = 2 # corresponding to task and rest - - scaler = StandardScaler() - X_normalized = scaler.fit_transform(X) - kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5) - labels_pred = kmeans.fit_predict(X_normalized) - - # ARI score - print(f"ARI score: {adjusted_rand_score(y, labels_pred)}") - - # # visualize clustering centroids - # centroids = kmeans.cluster_centers_ - # centroids = pca.inverse_transform(centroids) - # centroids = scaler.inverse_transform(centroids) - # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) - # centroids_mat = dFC_vec2mat(centroids, n_regions) - - clustering_RESULTS = { - "StandardScaler": scaler, - "kmeans": kmeans, - "ARI": adjusted_rand_score(y, labels_pred), - # "centroids": centroids_mat, - } - + clustering_RESULTS = {} clustering_scores = { "subj_id": list(), "task": list(), @@ -1265,22 +1233,69 @@ def task_presence_clustering( "Kmeans ARI": list(), "SI": list(), } - for subj in SUBJECTS: - 
clustering_scores["subj_id"].append(subj) - features = X[subj_label == subj, :] - target = y[subj_label == subj] + for embedding in ["PCA", "LE"]: + # embed dFC features + X_embedded, _ = embed_dFC_features( + train_subjects=SUBJECTS, + test_subjects=[], + X_train=X, + X_test=None, + y_train=y, + y_test=None, + subj_label_train=subj_label, + subj_label_test=None, + embedding=embedding, + n_components="auto", + n_neighbors_LE=125, + LE_embedding_method="embed+procrustes", + ) + + # clustering + # apply kmeans clustering to dFC features + + n_clusters = 2 # corresponding to task and rest + + scaler = StandardScaler() + X_normalized = scaler.fit_transform(X_embedded) + kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5) + labels_pred = kmeans.fit_predict(X_normalized) + + # ARI score + print(f"ARI score: {adjusted_rand_score(y, labels_pred)}") + + # # visualize clustering centroids + # centroids = kmeans.cluster_centers_ + # centroids = pca.inverse_transform(centroids) + # centroids = scaler.inverse_transform(centroids) + # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) + # centroids_mat = dFC_vec2mat(centroids, n_regions) - features_normalized = scaler.transform(features) - pred_kmeans = kmeans.predict(features_normalized) + clustering_RESULTS[embedding] = { + "StandardScaler": scaler, + "kmeans": kmeans, + "ARI": adjusted_rand_score(y, labels_pred), + # "centroids": centroids_mat, + } - clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_kmeans)) + for subj in SUBJECTS: + clustering_scores["subj_id"].append(subj) + features = X_embedded[subj_label == subj, :] + target = y[subj_label == subj] - # silhouette score in terms of separability of original labels, not the clustering labels - clustering_scores["SI"].append(silhouette_score(features, target)) + features_normalized = scaler.transform(features) + pred_kmeans = kmeans.predict(features_normalized) - clustering_scores["task"].append(task) - 
clustering_scores["run"].append(run) - clustering_scores["dFC method"].append(measure_name) + clustering_scores["Kmeans ARI"].append( + adjusted_rand_score(target, pred_kmeans) + ) + + # silhouette score in terms of separability of original labels, not the clustering labels + clustering_scores["SI"].append(silhouette_score(features, target)) + + clustering_scores["task"].append(task) + clustering_scores["run"].append(run) + clustering_scores["dFC method"].append(measure_name) + clustering_scores["embedding"].append(embedding) return clustering_RESULTS, clustering_scores @@ -1353,47 +1368,49 @@ def task_paradigm_clustering( y = y[idx] subj_label = subj_label[idx] - # embed dFC features - X_embed, _ = embed_dFC_features( - train_subjects=SUBJECTS, - test_subjects=[], - X_train=X, - X_test=None, - y_train=y, - y_test=None, - subj_label_train=subj_label, - subj_label_test=None, - embedding="LE", - n_components="auto", - n_neighbors_LE=125, - LE_embedding_method="embed+procrustes", - ) + task_paradigm_clstr_RESULTS = {} + for embedding in ["PCA", "LE"]: + # embed dFC features + X_embed, _ = embed_dFC_features( + train_subjects=SUBJECTS, + test_subjects=[], + X_train=X, + X_test=None, + y_train=y, + y_test=None, + subj_label_train=subj_label, + subj_label_test=None, + embedding=embedding, + n_components="auto", + n_neighbors_LE=125, + LE_embedding_method="embed+procrustes", + ) - # clustering - # apply kmeans clustering to dFC features - - n_clusters = len(TASKS) # corresponding to task paradigms - - scaler = StandardScaler() - X_normalized = scaler.fit_transform(X_embed) - kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5) - labels_pred = kmeans.fit_predict(X_normalized) - - # # visualize clustering centroids - # centroids = kmeans.cluster_centers_ - # centroids = pca.inverse_transform(centroids) - # centroids = scaler.inverse_transform(centroids) - # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) - # centroids_mat = 
dFC_vec2mat(centroids, n_regions) - - task_paradigm_clstr_RESULTS = { - "dFC_method": measure_name, - "StandardScaler": scaler, - "kmeans": kmeans, - "ARI": adjusted_rand_score(y, labels_pred), - "SI": silhouette_score(X_normalized, y), - # "centroids": centroids_mat, - "task_paradigms": TASKS, - } + # clustering + # apply kmeans clustering to dFC features + + n_clusters = len(TASKS) # corresponding to task paradigms + + scaler = StandardScaler() + X_normalized = scaler.fit_transform(X_embed) + kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5) + labels_pred = kmeans.fit_predict(X_normalized) + + # # visualize clustering centroids + # centroids = kmeans.cluster_centers_ + # centroids = pca.inverse_transform(centroids) + # centroids = scaler.inverse_transform(centroids) + # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2) + # centroids_mat = dFC_vec2mat(centroids, n_regions) + + task_paradigm_clstr_RESULTS[embedding] = { + "dFC_method": measure_name, + "StandardScaler": scaler, + "kmeans": kmeans, + "ARI": adjusted_rand_score(y, labels_pred), + "SI": silhouette_score(X_normalized, y), + # "centroids": centroids_mat, + "task_paradigms": TASKS, + } return task_paradigm_clstr_RESULTS diff --git a/task_dFC/ML.py b/task_dFC/ML.py index a792130..9a473dd 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -72,6 +72,7 @@ def run_classification( "KNN accuracy": list(), # "Random Forest accuracy": list(), # "Gradient Boosting accuracy": list(), + "embedding": list(), } ML_RESULT = {} @@ -129,6 +130,7 @@ def run_clustering( "dFC method": list(), "Kmeans ARI": list(), "SI": list(), + "embedding": list(), } clustering_RESULTS = {} diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py index 4c99d88..301cb5e 100644 --- a/task_dFC/generate_report.py +++ b/task_dFC/generate_report.py @@ -452,7 +452,13 @@ def plot_dFC_matrices( def plot_ML_results( - ML_root, output_root, task, run=None, session=None, ML_algorithm="Random Forest" + 
ML_root, + output_root, + task, + run=None, + session=None, + ML_algorithm="Random Forest", + embedding="PCA", ): """ Plot the ML results for a given task, run and session. @@ -464,6 +470,7 @@ def plot_ML_results( run: int, run number session: str, session name ML_algorithm: str, ML algorithm name (default: Random Forest, other options: Logistic regression, KNN, Gradient Boosting) + embedding: str, embedding method (default: PCA, other options: LE) """ # the ML_scores files are saved as ML_scores_classify_{dFC_id}.npy # find all the ML_scores files in the directory @@ -493,10 +500,13 @@ def plot_ML_results( if run is not None: dataframe = dataframe[dataframe["run"] == run] + dataframe = dataframe[dataframe["task"] == task] + dataframe = dataframe[dataframe["embedding"] == embedding] + plt.figure(figsize=(10, 5)) g = sns.pointplot( - data=dataframe[dataframe["task"] == task], + data=dataframe, x="dFC method", y=f"{ML_algorithm} accuracy", hue="group", @@ -532,7 +542,7 @@ def plot_ML_results( if run is None: plt.savefig( - f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}.{save_fig_format}", + f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}_{embedding}.{save_fig_format}", dpi=fig_dpi, bbox_inches=fig_bbox_inches, pad_inches=fig_pad, @@ -540,7 +550,7 @@ def plot_ML_results( ) else: plt.savefig( - f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}_{run}.{save_fig_format}", + f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}_{run}_{embedding}.{save_fig_format}", dpi=fig_dpi, bbox_inches=fig_bbox_inches, pad_inches=fig_pad, @@ -550,7 +560,9 @@ def plot_ML_results( plt.close() -def plot_clustering_results(ML_root, output_root, task, run=None, session=None): +def plot_clustering_results( + ML_root, output_root, task, run=None, session=None, embedding="PCA" +): """ Plot the clustering results for a given task, run and session. 
parameters: @@ -560,6 +572,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): task: str, task name run: int, run number session: str, session name + embedding: str, embedding method (default: PCA, other options: LE) """ # the clustering_scores files are saved as clustering_scores_{dFC_id}.npy # find all the clustering_scores files in the directory @@ -593,10 +606,13 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): if run is not None: dataframe = dataframe[dataframe["run"] == run] + dataframe = dataframe[dataframe["task"] == task] + dataframe = dataframe[dataframe["embedding"] == embedding] + # plot ARI score plt.figure(figsize=(10, 5)) g = sns.pointplot( - data=dataframe[dataframe["task"] == task], + data=dataframe, x="dFC method", y="Kmeans ARI", errorbar="sd", @@ -621,7 +637,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): if run is None: plt.savefig( - f"{output_dir}/clustering_results_ARI_{task}.{save_fig_format}", + f"{output_dir}/clustering_results_ARI_{task}_{embedding}.{save_fig_format}", dpi=fig_dpi, bbox_inches=fig_bbox_inches, pad_inches=fig_pad, @@ -629,7 +645,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): ) else: plt.savefig( - f"{output_dir}/clustering_results_ARI_{task}_{run}.{save_fig_format}", + f"{output_dir}/clustering_results_ARI_{task}_{run}_{embedding}.{save_fig_format}", dpi=fig_dpi, bbox_inches=fig_bbox_inches, pad_inches=fig_pad, @@ -641,7 +657,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): # plot SI score plt.figure(figsize=(10, 5)) g = sns.pointplot( - data=dataframe[dataframe["task"] == task], + data=dataframe, x="dFC method", y="SI", errorbar="sd", @@ -664,7 +680,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): if run is None: plt.savefig( - f"{output_dir}/clustering_results_SI_{task}.{save_fig_format}", + 
f"{output_dir}/clustering_results_SI_{task}_{embedding}.{save_fig_format}", dpi=fig_dpi, bbox_inches=fig_bbox_inches, pad_inches=fig_pad, @@ -672,7 +688,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None): ) else: plt.savefig( - f"{output_dir}/clustering_results_SI_{task}_{run}.{save_fig_format}", + f"{output_dir}/clustering_results_SI_{task}_{run}_{embedding}.{save_fig_format}", dpi=fig_dpi, bbox_inches=fig_bbox_inches, pad_inches=fig_pad, @@ -686,6 +702,7 @@ def plot_paradigm_clustering_score( ML_root, output_root, session=None, + embedding="PCA", ): """ Plot the clustering results for a given task, run and session. @@ -696,6 +713,7 @@ def plot_paradigm_clustering_score( task: str, task name run: int, run number session: str, session name + embedding: str, embedding method (default: PCA, other options: LE) """ # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy # find all the paradigm_clustering_RESULTS files in the directory @@ -720,13 +738,13 @@ def plot_paradigm_clustering_score( f"{input_dir}/{result_file}", allow_pickle="TRUE" ).item() paradigm_clustering_RESULTS["dFC method"].append( - paradigm_clustering_RESULTS_new["dFC_method"] + paradigm_clustering_RESULTS_new[embedding]["dFC_method"] ) paradigm_clustering_RESULTS["ARI score"].append( - paradigm_clustering_RESULTS_new["ARI"] + paradigm_clustering_RESULTS_new[embedding]["ARI"] ) paradigm_clustering_RESULTS["SI score"].append( - paradigm_clustering_RESULTS_new["SI"] + paradigm_clustering_RESULTS_new[embedding]["SI"] ) sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0}) @@ -764,7 +782,7 @@ def plot_paradigm_clustering_score( os.makedirs(output_dir) plt.savefig( - f"{output_dir}/paradigm_clustering_results_ARI.{save_fig_format}", + f"{output_dir}/paradigm_clustering_results_ARI_{embedding}.{save_fig_format}", dpi=fig_dpi, bbox_inches=fig_bbox_inches, pad_inches=fig_pad, @@ -801,7 +819,7 @@ def 
plot_paradigm_clustering_score( os.makedirs(output_dir) plt.savefig( - f"{output_dir}/paradigm_clustering_results_SI.{save_fig_format}", + f"{output_dir}/paradigm_clustering_results_SI_{embedding}.{save_fig_format}", dpi=fig_dpi, bbox_inches=fig_bbox_inches, pad_inches=fig_pad, @@ -1225,45 +1243,41 @@ def create_html_report_group_results( else: classification_dir = f"{group_dir}/classification" - # display Random Forest classification results - file.write("

KNN

\n") - if run is None: - classification_img = ( - f"{classification_dir}/ML_results_classify_KNN_{task}.png" + for embedding in ["PCA", "LE"]: + file.write(f"

{embedding}

\n") + # display KNN classification results + file.write("

KNN

\n") + if run is None: + classification_img = f"{classification_dir}/ML_results_classify_KNN_{task}_{embedding}.png" + else: + classification_img = f"{classification_dir}/ML_results_classify_KNN_{task}_{run}_{embedding}.png" + img = plt.imread(classification_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + classification_img = classification_img.replace(group_dir, ".") + file.write( + f"Classification results\n" ) - else: - classification_img = ( - f"{classification_dir}/ML_results_classify_KNN_{task}_{run}.png" - ) - img = plt.imread(classification_img) - height, width, _ = img.shape - # change the width so that height equals img_height - width = int(width * img_height / height) - # replace the path to the image with a relative path - classification_img = classification_img.replace(group_dir, ".") - file.write( - f"Classification results\n" - ) - # display Logistic regression classification results - file.write("

Logistic Regression

\n") - if run is None: - classification_img = ( - f"{classification_dir}/ML_results_classify_LogReg_{task}.png" + # display Logistic regression classification results + file.write("

Logistic Regression

\n") + if run is None: + classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{embedding}.png" + else: + classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{run}_{embedding}.png" + img = plt.imread(classification_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + classification_img = classification_img.replace(group_dir, ".") + file.write( + f"Classification results\n" ) - else: - classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{run}.png" - img = plt.imread(classification_img) - height, width, _ = img.shape - # change the width so that height equals img_height - width = int(width * img_height / height) - # replace the path to the image with a relative path - classification_img = classification_img.replace(group_dir, ".") - file.write( - f"Classification results\n" - ) - file.write("
\n") + file.write("
\n") # clustering results img_height = 300 @@ -1281,43 +1295,41 @@ def create_html_report_group_results( else: clustering_dir = f"{group_dir}/clustering" - # display clustering ARI results - if run is None: - clustering_img = f"{clustering_dir}/clustering_results_ARI_{task}.png" - else: - clustering_img = ( - f"{clustering_dir}/clustering_results_ARI_{task}_{run}.png" + for embedding in ["PCA", "LE"]: + file.write(f"

{embedding}

\n") + # display clustering ARI results + if run is None: + clustering_img = f"{clustering_dir}/clustering_results_ARI_{task}_{embedding}.png" + else: + clustering_img = f"{clustering_dir}/clustering_results_ARI_{task}_{run}_{embedding}.png" + img = plt.imread(clustering_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + clustering_img = clustering_img.replace(group_dir, ".") + file.write( + f"Clustering results\n" ) - img = plt.imread(clustering_img) - height, width, _ = img.shape - # change the width so that height equals img_height - width = int(width * img_height / height) - # replace the path to the image with a relative path - clustering_img = clustering_img.replace(group_dir, ".") - file.write( - f"Clustering results\n" - ) - file.write("
\n") - - # display clustering SI results - if run is None: - clustering_img = f"{clustering_dir}/clustering_results_SI_{task}.png" - else: - clustering_img = ( - f"{clustering_dir}/clustering_results_SI_{task}_{run}.png" + file.write("
\n") + + # display clustering SI results + if run is None: + clustering_img = f"{clustering_dir}/clustering_results_SI_{task}_{embedding}.png" + else: + clustering_img = f"{clustering_dir}/clustering_results_SI_{task}_{run}_{embedding}.png" + img = plt.imread(clustering_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + clustering_img = clustering_img.replace(group_dir, ".") + file.write( + f"Clustering results\n" ) - img = plt.imread(clustering_img) - height, width, _ = img.shape - # change the width so that height equals img_height - width = int(width * img_height / height) - # replace the path to the image with a relative path - clustering_img = clustering_img.replace(group_dir, ".") - file.write( - f"Clustering results\n" - ) - file.write("
\n") + file.write("
\n") # paradigm clustering results file.write("

Paradigm Clustering Results

\n") @@ -1332,38 +1344,38 @@ def create_html_report_group_results( # display paradigm clustering ARI scores img_height = 300 file.write("

Paradigm Clustering ARI Scores

\n") - paradigm_clustering_img = ( - f"{paradigm_clustering_dir}/paradigm_clustering_results_ARI.png" - ) - img = plt.imread(paradigm_clustering_img) - height, width, _ = img.shape - # change the width so that height equals img_height - width = int(width * img_height / height) - # replace the path to the image with a relative path - paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".") - file.write( - f"Paradigm clustering results\n" - ) + for embedding in ["PCA", "LE"]: + file.write(f"

{embedding}

\n") + paradigm_clustering_img = f"{paradigm_clustering_dir}/paradigm_clustering_results_ARI_{embedding}.png" + img = plt.imread(paradigm_clustering_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".") + file.write( + f"Paradigm clustering results\n" + ) - file.write("
\n") + file.write("
\n") # display paradigm clustering SI scores img_height = 300 file.write("

Paradigm Clustering SI Scores

\n") - paradigm_clustering_img = ( - f"{paradigm_clustering_dir}/paradigm_clustering_results_SI.png" - ) - img = plt.imread(paradigm_clustering_img) - height, width, _ = img.shape - # change the width so that height equals img_height - width = int(width * img_height / height) - # replace the path to the image with a relative path - paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".") - file.write( - f"Paradigm clustering results\n" - ) + for embedding in ["PCA", "LE"]: + file.write(f"

{embedding}

\n") + paradigm_clustering_img = f"{paradigm_clustering_dir}/paradigm_clustering_results_SI_{embedding}.png" + img = plt.imread(paradigm_clustering_img) + height, width, _ = img.shape + # change the width so that height equals img_height + width = int(width * img_height / height) + # replace the path to the image with a relative path + paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".") + file.write( + f"Paradigm clustering results\n" + ) - file.write("
\n") + file.write("
\n") # # display paradigm clustering centroids # img_height = 300 @@ -1590,14 +1602,16 @@ def create_html_report_group_results( except Exception as e: print(f"Error in plotting task presence features: {e}") - try: - plot_paradigm_clustering_score( - ML_root=ML_root, - output_root=reports_root, - session=session, - ) - except Exception as e: - print(f"Error in plotting paradigm clustering scores: {e}") + for embedding in ["PCA", "LE"]: + try: + plot_paradigm_clustering_score( + ML_root=ML_root, + output_root=reports_root, + session=session, + embedding=embedding, + ) + except Exception as e: + print(f"Error in plotting paradigm clustering scores: {e}") # try: # plot_paradigm_clstr_centroids( @@ -1610,38 +1624,33 @@ def create_html_report_group_results( for task in TASKS: for run in RUNS[task]: - try: - plot_ML_results( - ML_root=ML_root, - output_root=reports_root, - task=task, - run=run, - session=session, - ML_algorithm="KNN", - ) - except Exception as e: - print(f"Error in plotting ML results for KNN: {e}") - try: - plot_ML_results( - ML_root=ML_root, - output_root=reports_root, - task=task, - run=run, - session=session, - ML_algorithm="Logistic regression", - ) - except Exception as e: - print(f"Error in plotting ML results for Logistic regression: {e}") - try: - plot_clustering_results( - ML_root=ML_root, - output_root=reports_root, - task=task, - run=run, - session=session, - ) - except Exception as e: - print(f"Error in plotting clustering results: {e}") + for embedding in ["PCA", "LE"]: + for ML_algorithm in ["KNN", "Logistic regression"]: + try: + plot_ML_results( + ML_root=ML_root, + output_root=reports_root, + task=task, + run=run, + session=session, + ML_algorithm=ML_algorithm, + embedding=embedding, + ) + except Exception as e: + print( + f"Error in plotting ML results for {ML_algorithm} and {embedding}: {e}" + ) + try: + plot_clustering_results( + ML_root=ML_root, + output_root=reports_root, + task=task, + run=run, + session=session, + 
embedding=embedding, + ) + except Exception as e: + print(f"Error in plotting clustering results: {e}") # create html report try: From dbe1def78aa6bedd5a9defa642247f8285d0ffbe Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 20 Sep 2024 09:53:05 -0400 Subject: [PATCH 118/401] minor change --- pydfc/ml_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py index e7cce64..b3d9f9f 100644 --- a/pydfc/ml_utils.py +++ b/pydfc/ml_utils.py @@ -1232,6 +1232,7 @@ def task_presence_clustering( "dFC method": list(), "Kmeans ARI": list(), "SI": list(), + "embedding": list(), } for embedding in ["PCA", "LE"]: # embed dFC features From 1dfc2f69875e6aea718e946b4c59786ee4d13b44 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 20 Sep 2024 12:25:22 -0400 Subject: [PATCH 119/401] minor change --- simul_dFC/run_scripts_slurm/run_simulator.sh | 6 +++--- task_dFC/run_scripts_slurm/dataset_info.json | 2 +- task_dFC/run_scripts_slurm/global_config.json | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/simul_dFC/run_scripts_slurm/run_simulator.sh b/simul_dFC/run_scripts_slurm/run_simulator.sh index b363a7f..eaf0194 100644 --- a/simul_dFC/run_scripts_slurm/run_simulator.sh +++ b/simul_dFC/run_scripts_slurm/run_simulator.sh @@ -3,15 +3,15 @@ #SBATCH --job-name=simul_dfc_job # Optional: Name of your job #SBATCH --output=logs/simul_out.txt # Standard output log #SBATCH --error=logs/simul_err.txt # Standard error log -#SBATCH --account=rrg-jbpoline # Account +#SBATCH --account=def-jbpoline # Account #SBATCH --mem=8G # Memory request per node #SBATCH --array=1-200 # Task array specification # Activate virtual environment -source "/home/mt00/projects/rrg-jbpoline/mt00/venvs/pydfc/bin/activate" +source "/home/mt00/pydfc/bin/activate" # Run Python script -python "/home/mt00/projects/rrg-jbpoline/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py" +python "/home/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py" # Deactivate environment 
deactivate diff --git a/task_dFC/run_scripts_slurm/dataset_info.json b/task_dFC/run_scripts_slurm/dataset_info.json index 16d775e..e466511 100644 --- a/task_dFC/run_scripts_slurm/dataset_info.json +++ b/task_dFC/run_scripts_slurm/dataset_info.json @@ -1,6 +1,6 @@ { "dataset" : "", - "main_root" : "/data/origami/dFC/DATA/task-based/openneuro/{dataset}", + "main_root" : "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/{dataset}", "fmriprep_root" : "{main_root}/derivatives/fmriprep/23.1.3/output", "roi_root" : "{main_root}/derivatives/ROI_timeseries", "fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES", diff --git a/task_dFC/run_scripts_slurm/global_config.json b/task_dFC/run_scripts_slurm/global_config.json index 252968f..0e0681c 100644 --- a/task_dFC/run_scripts_slurm/global_config.json +++ b/task_dFC/run_scripts_slurm/global_config.json @@ -6,7 +6,7 @@ "[[NIPOPPY_DPATH_CONTAINERS]]": "/home/mt00/projects/def-jbpoline/container_store/nipoppy/", "[[HEUDICONV_HEURISTIC_FILE]]": "", "[[DCM2BIDS_CONFIG_FILE]]": "", - "[[FREESURFER_LICENSE_FILE]]": "/home/mt00/projects/rrg-jbpoline/mt00/freesurfer/", + "[[FREESURFER_LICENSE_FILE]]": "/home/mt00/projects/def-jbpoline/mt00/freesurfer/", "[[TEMPLATEFLOW_HOME]]": "/home/mt00/projects/def-jbpoline/templateflow" }, "DICOM_DIR_PARTICIPANT_FIRST": true, From 8262e842b6ece752c477dd7bbed9feb29557c8dd Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 20 Sep 2024 12:30:55 -0400 Subject: [PATCH 120/401] minor change --- simul_dFC/run_scripts_slurm/run_simulator.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simul_dFC/run_scripts_slurm/run_simulator.sh b/simul_dFC/run_scripts_slurm/run_simulator.sh index eaf0194..d17f3bd 100644 --- a/simul_dFC/run_scripts_slurm/run_simulator.sh +++ b/simul_dFC/run_scripts_slurm/run_simulator.sh @@ -8,7 +8,7 @@ #SBATCH --array=1-200 # Task array specification # Activate virtual environment -source "/home/mt00/pydfc/bin/activate" +source 
"/home/mt00/venvs/pydfc/bin/activate" # Run Python script python "/home/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py" From 9bf88087fc3cb0a1034cf0495b4f64c3dd664e4b Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 20 Sep 2024 12:52:11 -0400 Subject: [PATCH 121/401] slurm changes --- simul_dFC/run_scripts_slurm/run_simulator.sh | 1 + task_dFC/run_scripts_slurm/run_FCS.sh | 20 ++++++++++--------- task_dFC/run_scripts_slurm/run_ML.sh | 20 ++++++++++--------- task_dFC/run_scripts_slurm/run_dFC.sh | 20 ++++++++++--------- task_dFC/run_scripts_slurm/run_fmriprep.sh | 5 ++--- .../run_scripts_slurm/run_nifti_to_roi.sh | 20 ++++++++++--------- task_dFC/run_scripts_slurm/run_report.sh | 20 ++++++++++--------- 7 files changed, 58 insertions(+), 48 deletions(-) diff --git a/simul_dFC/run_scripts_slurm/run_simulator.sh b/simul_dFC/run_scripts_slurm/run_simulator.sh index d17f3bd..f7f8998 100644 --- a/simul_dFC/run_scripts_slurm/run_simulator.sh +++ b/simul_dFC/run_scripts_slurm/run_simulator.sh @@ -4,6 +4,7 @@ #SBATCH --output=logs/simul_out.txt # Standard output log #SBATCH --error=logs/simul_err.txt # Standard error log #SBATCH --account=def-jbpoline # Account +#SBATCH --time=24:00:00 # Walltime for each task (24 hours) #SBATCH --mem=8G # Memory request per node #SBATCH --array=1-200 # Task array specification diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh index a84c578..b4d3b52 100644 --- a/task_dFC/run_scripts_slurm/run_FCS.sh +++ b/task_dFC/run_scripts_slurm/run_FCS.sh @@ -1,18 +1,20 @@ #!/bin/sh # -#$ -cwd -#$ -o logs/fcs_out.txt -#$ -e logs/fcs_err.txt -#$ -l h_vmem=64G -#$ -q origami.q +#SBATCH --job-name=fit_fcs_job # Optional: Name of your job +#SBATCH --output=logs/fcs_out.txt # Standard output log +#SBATCH --error=logs/fcs_err.txt # Standard error log +#SBATCH --account=def-jbpoline # Account +#SBATCH --time=96:00:00 # Walltime for each task (96 hours) +#SBATCH --mem=64G # Memory request per node 
DATASET_INFO="./dataset_info.json" METHODS_CONFIG="./methods_config.json" -source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh -conda activate pydfc -python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/FCS_estimate.py" \ +# Activate virtual environment +source "/home/mt00/venvs/pydfc/bin/activate" + +python "/home/mt00/pydfc/dFC/task_dFC/FCS_estimate.py" \ --dataset_info $DATASET_INFO \ --methods_config $METHODS_CONFIG -conda deactivate +deactivate diff --git a/task_dFC/run_scripts_slurm/run_ML.sh b/task_dFC/run_scripts_slurm/run_ML.sh index 4ec431a..ff4a52b 100644 --- a/task_dFC/run_scripts_slurm/run_ML.sh +++ b/task_dFC/run_scripts_slurm/run_ML.sh @@ -1,16 +1,18 @@ #!/bin/sh # -#$ -cwd -#$ -o logs/ML_out.txt -#$ -e logs/ML_err.txt -#$ -l h_vmem=64G -#$ -q origami.q +#SBATCH --job-name=ML_job # Optional: Name of your job +#SBATCH --output=logs/ML_out.txt # Standard output log +#SBATCH --error=logs/ML_err.txt # Standard error log +#SBATCH --account=def-jbpoline # Account +#SBATCH --time=72:00:00 # Walltime for each task (72 hours) +#SBATCH --mem=64G # Memory request per node DATASET_INFO="./dataset_info.json" -source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh -conda activate pydfc -python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/ML.py" \ +# Activate virtual environment +source "/home/mt00/venvs/pydfc/bin/activate" + +python "/home/mt00/pydfc/dFC/task_dFC/ML.py" \ --dataset_info $DATASET_INFO -conda deactivate +deactivate diff --git a/task_dFC/run_scripts_slurm/run_dFC.sh b/task_dFC/run_scripts_slurm/run_dFC.sh index 124dc1f..84edbb9 100644 --- a/task_dFC/run_scripts_slurm/run_dFC.sh +++ b/task_dFC/run_scripts_slurm/run_dFC.sh @@ -1,10 +1,11 @@ #!/bin/sh # -#$ -cwd -#$ -o logs/dfc_out.txt -#$ -e logs/dfc_err.txt -#$ -l h_vmem=32G -#$ -q origami.q +#SBATCH --job-name=assess_dfc_job # Optional: Name of your job +#SBATCH --output=logs/dfc_out.txt # Standard output log +#SBATCH --error=logs/dfc_err.txt # Standard error log +#SBATCH --account=def-jbpoline 
# Account +#SBATCH --time=24:00:00 # Walltime for each task (24 hours) +#SBATCH --mem=32G # Memory request per node SUBJECT_LIST="./subj_list.txt" DATASET_INFO="./dataset_info.json" @@ -14,10 +15,11 @@ echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`" SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST` echo "Subject ID: $SUBJECT_ID" -source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh -conda activate pydfc -python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/dFC_assessment.py" \ +# Activate virtual environment +source "/home/mt00/venvs/pydfc/bin/activate" + +python "/home/mt00/pydfc/dFC/task_dFC/dFC_assessment.py" \ --dataset_info $DATASET_INFO \ --participant_id $SUBJECT_ID -conda deactivate +deactivate diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh index 1e90631..ed9306f 100644 --- a/task_dFC/run_scripts_slurm/run_fmriprep.sh +++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh @@ -5,10 +5,9 @@ #SBATCH --error=logs/fmriprep_err.log # Standard error log #SBATCH --time=24:00:00 # Walltime (24 hours) #SBATCH --mem=32G # Memory (32 GB) -#SBATCH --cpus-per-task=1 # Number of CPU cores per task -#SBATCH --account=rrg-jbpoline # Account +#SBATCH --account=def-jbpoline # Account -source "/home/mt00/projects/rrg-jbpoline/mt00/venvs/nipoppy_env/bin/activate" +source "/home/mt00/venvs/nipoppy_env/bin/activate" SUBJECT_LIST="./subj_list.txt" diff --git a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh index 1fff1da..0462e86 100644 --- a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh +++ b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh @@ -1,10 +1,11 @@ #!/bin/sh # -#$ -cwd -#$ -o logs/roi_out.txt -#$ -e logs/roi_err.txt -#$ -l h_vmem=32G -#$ -q origami.q +#SBATCH --job-name=extract_roi_job # Optional: Name of your job +#SBATCH --output=logs/roi_out.txt # Standard output log +#SBATCH --error=logs/roi_err.txt # Standard error log +#SBATCH --account=def-jbpoline # 
Account +#SBATCH --time=24:00:00 # Walltime for each task (24 hours) +#SBATCH --mem=32G # Memory request per node SUBJECT_LIST="./subj_list.txt" DATASET_INFO="./dataset_info.json" @@ -14,10 +15,11 @@ echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`" SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST` echo "Subject ID: $SUBJECT_ID" -source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh -conda activate pydfc -python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/nifti_to_roi_signal.py" \ +# Activate virtual environment +source "/home/mt00/venvs/pydfc/bin/activate" + +python "/home/mt00/pydfc/dFC/task_dFC/nifti_to_roi_signal.py" \ --dataset_info $DATASET_INFO \ --participant_id $SUBJECT_ID -conda deactivate +deactivate diff --git a/task_dFC/run_scripts_slurm/run_report.sh b/task_dFC/run_scripts_slurm/run_report.sh index 2a00cc5..11167d8 100644 --- a/task_dFC/run_scripts_slurm/run_report.sh +++ b/task_dFC/run_scripts_slurm/run_report.sh @@ -1,18 +1,20 @@ #!/bin/sh # -#$ -cwd -#$ -o logs/report_out.txt -#$ -e logs/report_err.txt -#$ -l h_vmem=16G -#$ -q origami.q +#SBATCH --job-name=report_job # Optional: Name of your job +#SBATCH --output=logs/report_out.txt # Standard output log +#SBATCH --error=logs/report_err.txt # Standard error log +#SBATCH --account=def-jbpoline # Account +#SBATCH --time=24:00:00 # Walltime for each task (24 hours) +#SBATCH --mem=16G # Memory request per node DATASET_INFO="./dataset_info.json" SUBJ_LIST="./subj_list.txt" -source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh -conda activate pydfc -python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/generate_report.py" \ +# Activate virtual environment +source "/home/mt00/venvs/pydfc/bin/activate" + +python "/home/mt00/pydfc/dFC/task_dFC/generate_report.py" \ --dataset_info $DATASET_INFO \ --subj_list $SUBJ_LIST -conda deactivate +deactivate From f0326c3d5d88286a6e9d6b2218d0b0a3261fc27b Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 20 Sep 2024 13:22:07 -0400 Subject: [PATCH 
122/401] minor fix --- simul_dFC/task_data_simulator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py index ba3b6c5..3dd8315 100644 --- a/simul_dFC/task_data_simulator.py +++ b/simul_dFC/task_data_simulator.py @@ -39,8 +39,10 @@ # create a subject id list subj_list = [f"sub-{i:04d}" for i in range(1, n_subj + 1)] -job_id = int(os.getenv("SGE_TASK_ID")) -subj_id = subj_list[job_id - 1] # SGE_TASK_ID starts from 1 not 0 +job_id = int(os.getenv("SGE_TASK_ID")) # for SGE +if job_id is None: + job_id = int(os.getenv("SLURM_ARRAY_TASK_ID")) # for SLURM +subj_id = subj_list[job_id - 1] # TASK_ID starts from 1 not 0 print(f"subject-level simulation started running ... for subject: {subj_id} ...") From 98caf6942789f3c902d43fa53f9b7df36430f4e2 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 20 Sep 2024 13:26:55 -0400 Subject: [PATCH 123/401] add SLURM_ARRAY_TASK_ID --- task_dFC/FCS_estimate.py | 6 ++++-- task_dFC/ML.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py index 064988c..0fc67ae 100644 --- a/task_dFC/FCS_estimate.py +++ b/task_dFC/FCS_estimate.py @@ -120,8 +120,10 @@ def run_FCS_estimate( TASKS = dataset_info["TASKS"] - job_id = int(os.getenv("SGE_TASK_ID")) - TASK_id = job_id - 1 # SGE_TASK_ID starts from 1 not 0 + job_id = int(os.getenv("SGE_TASK_ID")) # for SGE + if job_id is None: + job_id = int(os.getenv("SLURM_ARRAY_TASK_ID")) # for SLURM + TASK_id = job_id - 1 # TASK_ID starts from 1 not 0 if TASK_id >= len(TASKS): print("TASK_id out of TASKS") exit() diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 9a473dd..4674f59 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -279,8 +279,10 @@ def run_task_paradigm_clustering( traceback.print_exc() print("Task features extraction finished.") - job_id = int(os.getenv("SGE_TASK_ID")) - dFC_id = job_id - 1 # SGE_TASK_ID starts from 1 not 0 + 
job_id = int(os.getenv("SGE_TASK_ID")) # for SGE + if job_id is None: + job_id = int(os.getenv("SLURM_ARRAY_TASK_ID")) # for SLURM + dFC_id = job_id - 1 # TASK_ID starts from 1 not 0 print(f"Task presence classification started for dFC ID {dFC_id}...") try: From fa0632cfd6314d752f2e3083ef96488ae4da2b4a Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 20 Sep 2024 13:46:33 -0400 Subject: [PATCH 124/401] minor fix --- simul_dFC/task_data_simulator.py | 5 +++-- task_dFC/FCS_estimate.py | 5 +++-- task_dFC/ML.py | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py index 3dd8315..5b081da 100644 --- a/simul_dFC/task_data_simulator.py +++ b/simul_dFC/task_data_simulator.py @@ -39,9 +39,10 @@ # create a subject id list subj_list = [f"sub-{i:04d}" for i in range(1, n_subj + 1)] -job_id = int(os.getenv("SGE_TASK_ID")) # for SGE +job_id = os.getenv("SGE_TASK_ID") # for SGE if job_id is None: - job_id = int(os.getenv("SLURM_ARRAY_TASK_ID")) # for SLURM + job_id = os.getenv("SLURM_ARRAY_TASK_ID") # for SLURM +job_id = int(job_id) subj_id = subj_list[job_id - 1] # TASK_ID starts from 1 not 0 print(f"subject-level simulation started running ... 
for subject: {subj_id} ...") diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py index 0fc67ae..e54ef11 100644 --- a/task_dFC/FCS_estimate.py +++ b/task_dFC/FCS_estimate.py @@ -120,9 +120,10 @@ def run_FCS_estimate( TASKS = dataset_info["TASKS"] - job_id = int(os.getenv("SGE_TASK_ID")) # for SGE + job_id = os.getenv("SGE_TASK_ID") # for SGE if job_id is None: - job_id = int(os.getenv("SLURM_ARRAY_TASK_ID")) # for SLURM + job_id = os.getenv("SLURM_ARRAY_TASK_ID") # for SLURM + job_id = int(job_id) TASK_id = job_id - 1 # TASK_ID starts from 1 not 0 if TASK_id >= len(TASKS): print("TASK_id out of TASKS") diff --git a/task_dFC/ML.py b/task_dFC/ML.py index 4674f59..f05b4a4 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -279,9 +279,10 @@ def run_task_paradigm_clustering( traceback.print_exc() print("Task features extraction finished.") - job_id = int(os.getenv("SGE_TASK_ID")) # for SGE + job_id = os.getenv("SGE_TASK_ID") # for SGE if job_id is None: - job_id = int(os.getenv("SLURM_ARRAY_TASK_ID")) # for SLURM + job_id = os.getenv("SLURM_ARRAY_TASK_ID") # for SLURM + job_id = int(job_id) dFC_id = job_id - 1 # TASK_ID starts from 1 not 0 print(f"Task presence classification started for dFC ID {dFC_id}...") From 8e47e2b0b9f88158bbb1058d1bb5685db42e7a1f Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 20 Sep 2024 15:28:50 -0400 Subject: [PATCH 125/401] add dataset info to simulator --- simul_dFC/run_scripts_sge/run_simulator.sh | 7 +++- simul_dFC/run_scripts_slurm/run_simulator.sh | 5 ++- simul_dFC/task_data_simulator.py | 34 ++++++++++++++++---- task_dFC/nifti_to_roi_signal.py | 1 - 4 files changed, 38 insertions(+), 9 deletions(-) diff --git a/simul_dFC/run_scripts_sge/run_simulator.sh b/simul_dFC/run_scripts_sge/run_simulator.sh index e7f6394..6176236 100644 --- a/simul_dFC/run_scripts_sge/run_simulator.sh +++ b/simul_dFC/run_scripts_sge/run_simulator.sh @@ -8,7 +8,12 @@ #$ -l h_vmem=8G #$ -t 1:200 +DATASET_INFO="./dataset_info.json" + source 
/data/origami/dFC/anaconda3/etc/profile.d/conda.sh conda activate pydfc -python "/data/origami/dFC/CODEs/pydfc/dFC/simul_dFC/task_data_simulator.py" + +python "/data/origami/dFC/CODEs/pydfc/dFC/simul_dFC/task_data_simulator.py" \ +--dataset_info $DATASET_INFO + conda deactivate diff --git a/simul_dFC/run_scripts_slurm/run_simulator.sh b/simul_dFC/run_scripts_slurm/run_simulator.sh index f7f8998..21e669e 100644 --- a/simul_dFC/run_scripts_slurm/run_simulator.sh +++ b/simul_dFC/run_scripts_slurm/run_simulator.sh @@ -8,11 +8,14 @@ #SBATCH --mem=8G # Memory request per node #SBATCH --array=1-200 # Task array specification +DATASET_INFO="./dataset_info.json" + # Activate virtual environment source "/home/mt00/venvs/pydfc/bin/activate" # Run Python script -python "/home/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py" +python "/home/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py" \ +--dataset_info $DATASET_INFO # Deactivate environment deactivate diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py index 5b081da..15d43d7 100644 --- a/simul_dFC/task_data_simulator.py +++ b/simul_dFC/task_data_simulator.py @@ -4,6 +4,8 @@ @author: mte """ +import argparse +import json import os import warnings @@ -19,12 +21,6 @@ os.environ["OMP_NUM_THREADS"] = "16" ################################# Parameters #################################### -# data paths -dataset = "ds000002" -# main_root = f"./DATA/{dataset}" # for local -main_root = f"/data/origami/dFC/DATA/task-based/simulated/{dataset}" # for server -output_root = f"{main_root}/derivatives/ROI_timeseries" - # simulation parameters sim_length = 250e3 # in m sec onset_time = 20.0 # in seconds @@ -36,6 +32,32 @@ dt = 0.5 # integration step n_subj = 200 # number of subjects +# argparse +HELPTEXT = """ +Script to simulate task-based data. 
+""" +parser = argparse.ArgumentParser(description=HELPTEXT) + +parser.add_argument("--dataset_info", type=str, help="path to dataset info file") + +args = parser.parse_args() + +dataset_info_file = args.dataset_info + +# Read dataset info +with open(dataset_info_file, "r") as f: + dataset_info = json.load(f) + +if "{dataset}" in dataset_info["main_root"]: + main_root = dataset_info["main_root"].replace("{dataset}", dataset_info["dataset"]) +else: + main_root = dataset_info["main_root"] + +if "{main_root}" in dataset_info["roi_root"]: + output_root = dataset_info["roi_root"].replace("{main_root}", main_root) +else: + output_root = dataset_info["roi_root"] + # create a subject id list subj_list = [f"sub-{i:04d}" for i in range(1, n_subj + 1)] diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index 56880e4..e8d7aa1 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -2,7 +2,6 @@ import json import os import warnings -from re import A import numpy as np From 5c2ecc74013936a781532e3462cb5e7ffffa6661 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 23 Sep 2024 23:40:53 -0400 Subject: [PATCH 126/401] minor fix --- task_dFC/run_scripts_slurm/run_dFC.sh | 2 +- task_dFC/run_scripts_slurm/run_nifti_to_roi.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/task_dFC/run_scripts_slurm/run_dFC.sh b/task_dFC/run_scripts_slurm/run_dFC.sh index 84edbb9..8c33edd 100644 --- a/task_dFC/run_scripts_slurm/run_dFC.sh +++ b/task_dFC/run_scripts_slurm/run_dFC.sh @@ -12,7 +12,7 @@ DATASET_INFO="./dataset_info.json" echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`" -SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST` +SUBJECT_ID=`sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST` echo "Subject ID: $SUBJECT_ID" # Activate virtual environment diff --git a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh index 0462e86..419efc7 100644 --- 
a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh +++ b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh @@ -12,7 +12,7 @@ DATASET_INFO="./dataset_info.json" echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`" -SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST` +SUBJECT_ID=`sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST` echo "Subject ID: $SUBJECT_ID" # Activate virtual environment From 31e845a46401844838dc845b9307bb94e9344419 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 24 Sep 2024 14:16:59 -0400 Subject: [PATCH 127/401] dfc embed error handle --- pydfc/ml_utils.py | 133 +++++++++++++++++++++++++--------------------- 1 file changed, 72 insertions(+), 61 deletions(-) diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py index b3d9f9f..32ded52 100644 --- a/pydfc/ml_utils.py +++ b/pydfc/ml_utils.py @@ -457,7 +457,7 @@ def generalized_procrustes(X_list): except: continue - raise ValueError("Generalized Procrustes Analysis did not converge.") + raise RuntimeError("Generalized Procrustes Analysis did not converge.") def twonn(X, discard_ratio=0.1): @@ -523,20 +523,23 @@ def SI_ID(X, y, search_range=range(2, 50, 5), n_neighbors_LE=125): SI_score = {} for n_components in search_range: - X_train_embed, _ = embed_dFC_features( - train_subjects=["subj"], - test_subjects=[], - X_train=X, - X_test=None, - y_train=y, - y_test=None, - subj_label_train=np.array(["subj"] * len(y)), - subj_label_test=None, - embedding="LE", - n_components=n_components, - n_neighbors_LE=n_neighbors_LE, - LE_embedding_method="embed+procrustes", - ) + try: + X_train_embed, _ = embed_dFC_features( + train_subjects=["subj"], + test_subjects=[], + X_train=X, + X_test=None, + y_train=y, + y_test=None, + subj_label_train=np.array(["subj"] * len(y)), + subj_label_test=None, + embedding="LE", + n_components=n_components, + n_neighbors_LE=n_neighbors_LE, + LE_embedding_method="embed+procrustes", + ) + except: + continue SI_score[n_components] = silhouette_score(X_train_embed, y) @@ -1082,7 +1085,7 @@ 
def task_presence_classification( ) ) - ML_RESULT = {} + ML_RESULT = {"PCA": {}, "LE": {}} ML_scores = { "subj_id": list(), "group": list(), @@ -1097,20 +1100,23 @@ def task_presence_classification( } for embedding in ["PCA", "LE"]: # embed dFC features - X_train_embedded, X_test_embedded = embed_dFC_features( - train_subjects=train_subjects, - test_subjects=test_subjects, - X_train=X_train, - X_test=X_test, - y_train=y_train, - y_test=y_test, - subj_label_train=subj_label_train, - subj_label_test=subj_label_test, - embedding=embedding, - n_components="auto", - n_neighbors_LE=125, - LE_embedding_method="embed+procrustes", - ) + try: + X_train_embedded, X_test_embedded = embed_dFC_features( + train_subjects=train_subjects, + test_subjects=test_subjects, + X_train=X_train, + X_test=X_test, + y_train=y_train, + y_test=y_test, + subj_label_train=subj_label_train, + subj_label_test=subj_label_test, + embedding=embedding, + n_components="auto", + n_neighbors_LE=125, + LE_embedding_method="embed+procrustes", + ) + except: + continue # task presence classification @@ -1134,7 +1140,6 @@ def task_presence_classification( # X_train_embedded, y_train, X_test_embedded, y_test # ) - ML_RESULT[embedding] = {} for key in log_reg_RESULT: ML_RESULT[embedding][key] = log_reg_RESULT[key] for key in KNN_RESULT: @@ -1224,7 +1229,7 @@ def task_presence_clustering( normalize_dFC=normalize_dFC, ) - clustering_RESULTS = {} + clustering_RESULTS = {"PCA": {}, "LE": {}} clustering_scores = { "subj_id": list(), "task": list(), @@ -1236,20 +1241,23 @@ def task_presence_clustering( } for embedding in ["PCA", "LE"]: # embed dFC features - X_embedded, _ = embed_dFC_features( - train_subjects=SUBJECTS, - test_subjects=[], - X_train=X, - X_test=None, - y_train=y, - y_test=None, - subj_label_train=subj_label, - subj_label_test=None, - embedding=embedding, - n_components="auto", - n_neighbors_LE=125, - LE_embedding_method="embed+procrustes", - ) + try: + X_embedded, _ = embed_dFC_features( + 
train_subjects=SUBJECTS, + test_subjects=[], + X_train=X, + X_test=None, + y_train=y, + y_test=None, + subj_label_train=subj_label, + subj_label_test=None, + embedding=embedding, + n_components="auto", + n_neighbors_LE=125, + LE_embedding_method="embed+procrustes", + ) + except: + continue # clustering # apply kmeans clustering to dFC features @@ -1369,23 +1377,26 @@ def task_paradigm_clustering( y = y[idx] subj_label = subj_label[idx] - task_paradigm_clstr_RESULTS = {} + task_paradigm_clstr_RESULTS = {"PCA": {}, "LE": {}} for embedding in ["PCA", "LE"]: # embed dFC features - X_embed, _ = embed_dFC_features( - train_subjects=SUBJECTS, - test_subjects=[], - X_train=X, - X_test=None, - y_train=y, - y_test=None, - subj_label_train=subj_label, - subj_label_test=None, - embedding=embedding, - n_components="auto", - n_neighbors_LE=125, - LE_embedding_method="embed+procrustes", - ) + try: + X_embed, _ = embed_dFC_features( + train_subjects=SUBJECTS, + test_subjects=[], + X_train=X, + X_test=None, + y_train=y, + y_test=None, + subj_label_train=subj_label, + subj_label_test=None, + embedding=embedding, + n_components="auto", + n_neighbors_LE=125, + LE_embedding_method="embed+procrustes", + ) + except: + continue # clustering # apply kmeans clustering to dFC features From 427c3c694ccb9bfdfeaf28ab49dd29a8f47789b1 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 24 Sep 2024 16:44:05 -0400 Subject: [PATCH 128/401] minor change --- task_dFC/ML.py | 99 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 40 deletions(-) diff --git a/task_dFC/ML.py b/task_dFC/ML.py index f05b4a4..c965319 100644 --- a/task_dFC/ML.py +++ b/task_dFC/ML.py @@ -79,22 +79,28 @@ def run_classification( for task_id, task in enumerate(TASKS): ML_RESULT[task] = {} for run in RUNS[task]: - ML_RESULT_new, ML_scores_new = task_presence_classification( - task=task, - dFC_id=dFC_id, - roi_root=roi_root, - dFC_root=dFC_root, - run=run, - session=session, - 
dynamic_pred=dynamic_pred, - normalize_dFC=normalize_dFC, - ) - if run is None: - ML_RESULT[task] = ML_RESULT_new - else: - ML_RESULT[task][run] = ML_RESULT_new - for key in ML_scores: - ML_scores[key].extend(ML_scores_new[key]) + try: + ML_RESULT_new, ML_scores_new = task_presence_classification( + task=task, + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + run=run, + session=session, + dynamic_pred=dynamic_pred, + normalize_dFC=normalize_dFC, + ) + if run is None: + ML_RESULT[task] = ML_RESULT_new + else: + ML_RESULT[task][run] = ML_RESULT_new + for key in ML_scores: + ML_scores[key].extend(ML_scores_new[key]) + except Exception as e: + print( + f"Error in task presence classification for {session} {task} {run}: {e}" + ) + traceback.print_exc() if session is None: folder = f"{output_root}" @@ -137,21 +143,29 @@ def run_clustering( for task_id, task in enumerate(TASKS): clustering_RESULTS[task] = {} for run in RUNS[task]: - clustering_RESULTS_new, clustering_scores_new = task_presence_clustering( - task=task, - dFC_id=dFC_id, - roi_root=roi_root, - dFC_root=dFC_root, - run=run, - session=session, - normalize_dFC=normalize_dFC, - ) - if run is None: - clustering_RESULTS[task] = clustering_RESULTS_new - else: - clustering_RESULTS[task][run] = clustering_RESULTS_new - for key in clustering_scores: - clustering_scores[key].extend(clustering_scores_new[key]) + try: + clustering_RESULTS_new, clustering_scores_new = ( + task_presence_clustering( + task=task, + dFC_id=dFC_id, + roi_root=roi_root, + dFC_root=dFC_root, + run=run, + session=session, + normalize_dFC=normalize_dFC, + ) + ) + if run is None: + clustering_RESULTS[task] = clustering_RESULTS_new + else: + clustering_RESULTS[task][run] = clustering_RESULTS_new + for key in clustering_scores: + clustering_scores[key].extend(clustering_scores_new[key]) + except Exception as e: + print( + f"Error in task presence clustering for {session} {task} {run}: {e}" + ) + traceback.print_exc() if session is None: 
folder = f"{output_root}" @@ -179,15 +193,20 @@ def run_task_paradigm_clustering( ): for session in SESSIONS: - task_paradigm_clstr_RESULTS = task_paradigm_clustering( - dFC_id=dFC_id, - TASKS=TASKS, - RUNS=RUNS, - session=session, - roi_root=roi_root, - dFC_root=dFC_root, - normalize_dFC=normalize_dFC, - ) + try: + task_paradigm_clstr_RESULTS = task_paradigm_clustering( + dFC_id=dFC_id, + TASKS=TASKS, + RUNS=RUNS, + session=session, + roi_root=roi_root, + dFC_root=dFC_root, + normalize_dFC=normalize_dFC, + ) + except Exception as e: + print(f"Error in task paradigm clustering for {session}: {e}") + traceback.print_exc() + continue if session is None: folder = f"{output_root}" From 943cd7be6aed1ce42fcaa6451303c9e32a2fc4c3 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 3 Oct 2024 10:24:46 -0400 Subject: [PATCH 129/401] slurm change --- task_dFC/run_scripts_slurm/global_config.json | 4 ++-- task_dFC/run_scripts_slurm/run_ML.sh | 2 +- task_dFC/run_scripts_slurm/run_fmriprep.sh | 6 +++++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/task_dFC/run_scripts_slurm/global_config.json b/task_dFC/run_scripts_slurm/global_config.json index 0e0681c..a99d2d7 100644 --- a/task_dFC/run_scripts_slurm/global_config.json +++ b/task_dFC/run_scripts_slurm/global_config.json @@ -3,10 +3,10 @@ "VISIT_IDS": [], "SESSION_IDS": [], "SUBSTITUTIONS": { - "[[NIPOPPY_DPATH_CONTAINERS]]": "/home/mt00/projects/def-jbpoline/container_store/nipoppy/", + "[[NIPOPPY_DPATH_CONTAINERS]]": "/home/mt00/projects/def-jbpoline/container_store/nipoppy", "[[HEUDICONV_HEURISTIC_FILE]]": "", "[[DCM2BIDS_CONFIG_FILE]]": "", - "[[FREESURFER_LICENSE_FILE]]": "/home/mt00/projects/def-jbpoline/mt00/freesurfer/", + "[[FREESURFER_LICENSE_FILE]]": "/home/mt00/projects/def-jbpoline/mt00/freesurfer/license.txt", "[[TEMPLATEFLOW_HOME]]": "/home/mt00/projects/def-jbpoline/templateflow" }, "DICOM_DIR_PARTICIPANT_FIRST": true, diff --git a/task_dFC/run_scripts_slurm/run_ML.sh 
b/task_dFC/run_scripts_slurm/run_ML.sh index ff4a52b..4b166fd 100644 --- a/task_dFC/run_scripts_slurm/run_ML.sh +++ b/task_dFC/run_scripts_slurm/run_ML.sh @@ -5,7 +5,7 @@ #SBATCH --error=logs/ML_err.txt # Standard error log #SBATCH --account=def-jbpoline # Account #SBATCH --time=72:00:00 # Walltime for each task (72 hours) -#SBATCH --mem=64G # Memory request per node +#SBATCH --mem=70G # Memory request per node DATASET_INFO="./dataset_info.json" diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh index ed9306f..3bfd8a3 100644 --- a/task_dFC/run_scripts_slurm/run_fmriprep.sh +++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh @@ -4,8 +4,12 @@ #SBATCH --output=logs/fmriprep_out.log # Standard output log #SBATCH --error=logs/fmriprep_err.log # Standard error log #SBATCH --time=24:00:00 # Walltime (24 hours) -#SBATCH --mem=32G # Memory (32 GB) +#SBATCH --mem=64G # Memory (64 GB) +#SBATCH --cpus-per-task=8 # Number of CPU cores (increase based on availability) #SBATCH --account=def-jbpoline # Account +#SBATCH --tmp=100G # Allocate 100GB of temporary space + +module load apptainer source "/home/mt00/venvs/nipoppy_env/bin/activate" From 8292a676ba3739b2f00150910c2a47857b456bd3 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Fri, 11 Oct 2024 13:06:09 -0400 Subject: [PATCH 130/401] minor change --- task_dFC/run_scripts_slurm/run_fmriprep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh index 3bfd8a3..5e60266 100644 --- a/task_dFC/run_scripts_slurm/run_fmriprep.sh +++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh @@ -3,7 +3,7 @@ #SBATCH --job-name=fmriprep_job # Name of the job #SBATCH --output=logs/fmriprep_out.log # Standard output log #SBATCH --error=logs/fmriprep_err.log # Standard error log -#SBATCH --time=24:00:00 # Walltime (24 hours) +#SBATCH --time=72:00:00 # Walltime (72 hours) #SBATCH --mem=64G # Memory (64 GB) 
#SBATCH --cpus-per-task=8 # Number of CPU cores (increase based on availability) #SBATCH --account=def-jbpoline # Account From b5ea93d867d2ae21cea915b26785adc70bc66c2a Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 17 Oct 2024 01:00:51 -0400 Subject: [PATCH 131/401] minor change --- task_dFC/run_scripts_slurm/run_fmriprep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh index 5e60266..e78a88c 100644 --- a/task_dFC/run_scripts_slurm/run_fmriprep.sh +++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh @@ -3,7 +3,7 @@ #SBATCH --job-name=fmriprep_job # Name of the job #SBATCH --output=logs/fmriprep_out.log # Standard output log #SBATCH --error=logs/fmriprep_err.log # Standard error log -#SBATCH --time=72:00:00 # Walltime (72 hours) +#SBATCH --time=10-00:00:00 # Walltime (10 days) #SBATCH --mem=64G # Memory (64 GB) #SBATCH --cpus-per-task=8 # Number of CPU cores (increase based on availability) #SBATCH --account=def-jbpoline # Account From e21701ed85d6d994d9dce307f506ac7cce907a71 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 17 Oct 2024 01:02:37 -0400 Subject: [PATCH 132/401] minor change --- task_dFC/run_scripts_slurm/run_fmriprep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh index e78a88c..a2721b7 100644 --- a/task_dFC/run_scripts_slurm/run_fmriprep.sh +++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh @@ -3,7 +3,7 @@ #SBATCH --job-name=fmriprep_job # Name of the job #SBATCH --output=logs/fmriprep_out.log # Standard output log #SBATCH --error=logs/fmriprep_err.log # Standard error log -#SBATCH --time=10-00:00:00 # Walltime (10 days) +#SBATCH --time=7-00:00:00 # Walltime (7 days) #SBATCH --mem=64G # Memory (64 GB) #SBATCH --cpus-per-task=8 # Number of CPU cores (increase based on availability) #SBATCH --account=def-jbpoline # Account From 
ca0cbacbcae4cec8b8531cc92831758491081347 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 22 Oct 2024 16:59:44 -0400 Subject: [PATCH 133/401] minor change --- task_dFC/run_scripts_slurm/run_FCS.sh | 2 +- task_dFC/run_scripts_slurm/run_ML.sh | 2 +- task_dFC/run_scripts_slurm/run_dFC.sh | 2 +- task_dFC/run_scripts_slurm/run_fmriprep.sh | 2 +- task_dFC/run_scripts_slurm/run_nifti_to_roi.sh | 2 +- task_dFC/run_scripts_slurm/run_report.sh | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh index b4d3b52..7ef0058 100644 --- a/task_dFC/run_scripts_slurm/run_FCS.sh +++ b/task_dFC/run_scripts_slurm/run_FCS.sh @@ -3,7 +3,7 @@ #SBATCH --job-name=fit_fcs_job # Optional: Name of your job #SBATCH --output=logs/fcs_out.txt # Standard output log #SBATCH --error=logs/fcs_err.txt # Standard error log -#SBATCH --account=def-jbpoline # Account +#SBATCH --account=rrg-jbpoline # Account #SBATCH --time=96:00:00 # Walltime for each task (96 hours) #SBATCH --mem=64G # Memory request per node diff --git a/task_dFC/run_scripts_slurm/run_ML.sh b/task_dFC/run_scripts_slurm/run_ML.sh index 4b166fd..da8c6cc 100644 --- a/task_dFC/run_scripts_slurm/run_ML.sh +++ b/task_dFC/run_scripts_slurm/run_ML.sh @@ -3,7 +3,7 @@ #SBATCH --job-name=ML_job # Optional: Name of your job #SBATCH --output=logs/ML_out.txt # Standard output log #SBATCH --error=logs/ML_err.txt # Standard error log -#SBATCH --account=def-jbpoline # Account +#SBATCH --account=rrg-jbpoline # Account #SBATCH --time=72:00:00 # Walltime for each task (72 hours) #SBATCH --mem=70G # Memory request per node diff --git a/task_dFC/run_scripts_slurm/run_dFC.sh b/task_dFC/run_scripts_slurm/run_dFC.sh index 8c33edd..e329fd0 100644 --- a/task_dFC/run_scripts_slurm/run_dFC.sh +++ b/task_dFC/run_scripts_slurm/run_dFC.sh @@ -3,7 +3,7 @@ #SBATCH --job-name=assess_dfc_job # Optional: Name of your job #SBATCH --output=logs/dfc_out.txt # Standard output log 
#SBATCH --error=logs/dfc_err.txt # Standard error log -#SBATCH --account=def-jbpoline # Account +#SBATCH --account=rrg-jbpoline # Account #SBATCH --time=24:00:00 # Walltime for each task (24 hours) #SBATCH --mem=32G # Memory request per node diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh index a2721b7..7183c19 100644 --- a/task_dFC/run_scripts_slurm/run_fmriprep.sh +++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh @@ -6,7 +6,7 @@ #SBATCH --time=7-00:00:00 # Walltime (7 days) #SBATCH --mem=64G # Memory (64 GB) #SBATCH --cpus-per-task=8 # Number of CPU cores (increase based on availability) -#SBATCH --account=def-jbpoline # Account +#SBATCH --account=rrg-jbpoline # Account #SBATCH --tmp=100G # Allocate 100GB of temporary space module load apptainer diff --git a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh index 419efc7..36ada93 100644 --- a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh +++ b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh @@ -3,7 +3,7 @@ #SBATCH --job-name=extract_roi_job # Optional: Name of your job #SBATCH --output=logs/roi_out.txt # Standard output log #SBATCH --error=logs/roi_err.txt # Standard error log -#SBATCH --account=def-jbpoline # Account +#SBATCH --account=rrg-jbpoline # Account #SBATCH --time=24:00:00 # Walltime for each task (24 hours) #SBATCH --mem=32G # Memory request per node diff --git a/task_dFC/run_scripts_slurm/run_report.sh b/task_dFC/run_scripts_slurm/run_report.sh index 11167d8..57b6634 100644 --- a/task_dFC/run_scripts_slurm/run_report.sh +++ b/task_dFC/run_scripts_slurm/run_report.sh @@ -3,7 +3,7 @@ #SBATCH --job-name=report_job # Optional: Name of your job #SBATCH --output=logs/report_out.txt # Standard output log #SBATCH --error=logs/report_err.txt # Standard error log -#SBATCH --account=def-jbpoline # Account +#SBATCH --account=rrg-jbpoline # Account #SBATCH --time=24:00:00 # Walltime for each task (24 hours) #SBATCH 
--mem=16G # Memory request per node From 98dd86cf9bb1839a12b576db8ee0fc35489f29f9 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 29 Oct 2024 13:47:09 -0400 Subject: [PATCH 134/401] minor --- task_dFC/run_scripts_slurm/run_fmriprep.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh index 7183c19..de517f4 100644 --- a/task_dFC/run_scripts_slurm/run_fmriprep.sh +++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh @@ -7,7 +7,6 @@ #SBATCH --mem=64G # Memory (64 GB) #SBATCH --cpus-per-task=8 # Number of CPU cores (increase based on availability) #SBATCH --account=rrg-jbpoline # Account -#SBATCH --tmp=100G # Allocate 100GB of temporary space module load apptainer From 5f4858abafdc11229c98d472c0d85d70914be27c Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 4 Nov 2024 15:14:42 -0500 Subject: [PATCH 135/401] modify run_fmriprep --- task_dFC/run_scripts_slurm/run_fmriprep.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh index de517f4..130ca58 100644 --- a/task_dFC/run_scripts_slurm/run_fmriprep.sh +++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh @@ -4,7 +4,7 @@ #SBATCH --output=logs/fmriprep_out.log # Standard output log #SBATCH --error=logs/fmriprep_err.log # Standard error log #SBATCH --time=7-00:00:00 # Walltime (7 days) -#SBATCH --mem=64G # Memory (64 GB) +#SBATCH --mem=32G # Memory (32 GB) #SBATCH --cpus-per-task=8 # Number of CPU cores (increase based on availability) #SBATCH --account=rrg-jbpoline # Account @@ -20,8 +20,8 @@ SUBJECT_ID=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST) echo "Subject ID: $SUBJECT_ID" nipoppy run \ +"$(dirname "$(pwd)")" \ --pipeline fmriprep \ ---dataset-root "$(dirname "$(pwd)")" \ --participant-id $SUBJECT_ID deactivate From 1ab15c46e7d9d0d25182011656c2a6a0a20196fb Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 5 Nov 2024 
11:55:21 -0500 Subject: [PATCH 136/401] eigen_solver --- pydfc/ml_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py index 32ded52..c53d8b9 100644 --- a/pydfc/ml_utils.py +++ b/pydfc/ml_utils.py @@ -620,7 +620,7 @@ def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"): n_components=n_components, affinity="precomputed", n_neighbors=n_neighbors_to_be_used, - # eigen_solver="lobpcg", + eigen_solver="lobpcg", ) X_embed = LE.fit_transform(X=affinity_matrix) return X_embed From 752aacc35444c2aab2bc34bff8e979ec5541b100 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 11 Nov 2024 14:23:01 -0500 Subject: [PATCH 137/401] change confound strategy to simple --- pydfc/data_loader.py | 12 +++++++++++- task_dFC/nifti_to_roi_signal.py | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pydfc/data_loader.py b/pydfc/data_loader.py index fba1ced..7f65959 100644 --- a/pydfc/data_loader.py +++ b/pydfc/data_loader.py @@ -167,9 +167,12 @@ def nifti2array(nifti_file, confound_strategy="none", standardize=False, n_rois= 'no_motion_no_gsr': motion parameters are used and global signal regression is applied. 
+ 'simple': nilearn's simple preprocessing with + full motion and basic wm_csf + and high_pass """ from nilearn import datasets - from nilearn.interfaces.fmriprep import load_confounds + from nilearn.interfaces.fmriprep import load_confounds, load_confounds_strategy from nilearn.maskers import NiftiLabelsMasker from nilearn.plotting import find_parcellation_cut_coords @@ -223,6 +226,13 @@ def nifti2array(nifti_file, confound_strategy="none", standardize=False, n_rois= time_series = masker.fit_transform( nifti_file, confounds=confounds_simple, sample_mask=sample_mask ) + elif confound_strategy == "simple": + confounds_simple, sample_mask = load_confounds_strategy( + nifti_file, denoise_strategy="simple" + ) + time_series = masker.fit_transform( + nifti_file, confounds=confounds_simple, sample_mask=sample_mask + ) return time_series, labels, locs diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index e8d7aa1..00f010b 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -97,7 +97,7 @@ def run_roi_signal_extraction( n_rois=100, Fs=1 / TR_mri, subj_id=subj, - confound_strategy="no_motion", + confound_strategy="simple", standardize="zscore", TS_name="BOLD", session=task, From 3468a54472a4564ec574e1695f98babf2ff53bb7 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 11 Nov 2024 17:18:06 -0500 Subject: [PATCH 138/401] change fmriprep root for slurm --- task_dFC/nifti_to_roi_signal.py | 4 ++++ task_dFC/run_scripts_slurm/dataset_info.json | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index 00f010b..5d81b6d 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -241,6 +241,10 @@ def run_roi_signal_extraction( if "{main_root}" in dataset_info["fmriprep_root"]: fmriprep_root = dataset_info["fmriprep_root"].replace("{main_root}", main_root) + elif "{dataset}" in dataset_info["fmriprep_root"]: + 
fmriprep_root = dataset_info["fmriprep_root"].replace( + "{dataset}", dataset_info["dataset"] + ) else: fmriprep_root = dataset_info["fmriprep_root"] diff --git a/task_dFC/run_scripts_slurm/dataset_info.json b/task_dFC/run_scripts_slurm/dataset_info.json index e466511..bc39b0d 100644 --- a/task_dFC/run_scripts_slurm/dataset_info.json +++ b/task_dFC/run_scripts_slurm/dataset_info.json @@ -1,7 +1,7 @@ { "dataset" : "", "main_root" : "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/{dataset}", - "fmriprep_root" : "{main_root}/derivatives/fmriprep/23.1.3/output", + "fmriprep_root" : "/home/mt00/scratch/DATA/task-based/openneuro/{dataset}/derivatives/fmriprep/23.1.3/output", "roi_root" : "{main_root}/derivatives/ROI_timeseries", "fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES", "dFC_root" : "{main_root}/derivatives/dFC_assessed", From 0df68ad830a935bdb9aa36135bd75544ffeea3c2 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Mon, 11 Nov 2024 23:47:56 -0500 Subject: [PATCH 139/401] add bids_root --- task_dFC/nifti_to_roi_signal.py | 29 +++++++++++++------- task_dFC/run_scripts_sge/dataset_info.json | 1 + task_dFC/run_scripts_slurm/dataset_info.json | 1 + 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py index 5d81b6d..e4216a4 100644 --- a/task_dFC/nifti_to_roi_signal.py +++ b/task_dFC/nifti_to_roi_signal.py @@ -14,7 +14,7 @@ def run_roi_signal_extraction( subj, task, - main_root, + bids_root, fmriprep_root, bold_suffix, output_root, @@ -59,22 +59,22 @@ def run_roi_signal_extraction( task_file = [file_i for file_i in ALL_TASK_FILES if f"_{run}_" in file_i][0] if session is None: nifti_file = f"{fmriprep_root}/{subj}/func/{task_file}" - task_events_root = f"{main_root}/bids/{subj}/func" + task_events_root = f"{bids_root}/{subj}/func" else: nifti_file = f"{fmriprep_root}/{subj}/{session}/func/{task_file}" - task_events_root = 
f"{main_root}/bids/{subj}/{session}/func" + task_events_root = f"{bids_root}/{subj}/{session}/func" info_file = f"{task_events_root}/{task_file.replace(bold_suffix, '_bold.json')}" - # in some cases the info file is common for all subjects and can be found in f"{main_root}/bids" + # in some cases the info file is common for all subjects and can be found in f"{bids_root}" if not os.path.exists(info_file): - ALL_COMMON_FILES = os.listdir(f"{main_root}/bids/") + ALL_COMMON_FILES = os.listdir(f"{bids_root}/") ALL_COMMON_FILES = [ file_i for file_i in ALL_COMMON_FILES if (f"{task}_" in file_i) and ("_bold.json" in file_i) ] if len(ALL_COMMON_FILES) == 1: - info_file = f"{main_root}/bids/{ALL_COMMON_FILES[0]}" + info_file = f"{bids_root}/{ALL_COMMON_FILES[0]}" if not os.path.exists(info_file): # if the info file is not found, exclude the subject if run is None: @@ -124,15 +124,15 @@ def run_roi_signal_extraction( ] if not len(ALL_EVENTS_FILES) == 1: - # in some cases the event file is common for all subjects and can be found in f"{main_root}/bids" - ALL_EVENTS_FILES_COMMON = os.listdir(f"{main_root}/bids/") + # in some cases the event file is common for all subjects and can be found in f"{bids_root}" + ALL_EVENTS_FILES_COMMON = os.listdir(f"{bids_root}/") ALL_EVENTS_FILES_COMMON = [ file_i for file_i in ALL_EVENTS_FILES_COMMON if (f"{task}_" in file_i) and ("events.tsv" in file_i) ] if len(ALL_EVENTS_FILES_COMMON) == 1: - events_file = f"{main_root}/bids/{ALL_EVENTS_FILES_COMMON[0]}" + events_file = f"{bids_root}/{ALL_EVENTS_FILES_COMMON[0]}" else: # if the events file is not found, exclude the subject if run is None: @@ -239,6 +239,15 @@ def run_roi_signal_extraction( else: main_root = dataset_info["main_root"] + if "{main_root}" in dataset_info["bids_root"]: + bids_root = dataset_info["bids_root"].replace("{main_root}", main_root) + elif "{dataset}" in dataset_info["bids_root"]: + bids_root = dataset_info["bids_root"].replace( + "{dataset}", dataset_info["dataset"] + ) 
+ else: + bids_root = dataset_info["bids_root"] + if "{main_root}" in dataset_info["fmriprep_root"]: fmriprep_root = dataset_info["fmriprep_root"].replace("{main_root}", main_root) elif "{dataset}" in dataset_info["fmriprep_root"]: @@ -261,7 +270,7 @@ def run_roi_signal_extraction( run_roi_signal_extraction( subj=participant_id, task=task, - main_root=main_root, + bids_root=bids_root, fmriprep_root=fmriprep_root, bold_suffix=dataset_info["bold_suffix"], output_root=output_root, diff --git a/task_dFC/run_scripts_sge/dataset_info.json b/task_dFC/run_scripts_sge/dataset_info.json index 16d775e..30531e6 100644 --- a/task_dFC/run_scripts_sge/dataset_info.json +++ b/task_dFC/run_scripts_sge/dataset_info.json @@ -1,6 +1,7 @@ { "dataset" : "", "main_root" : "/data/origami/dFC/DATA/task-based/openneuro/{dataset}", + "bids_root" : "{main_root}/bids", "fmriprep_root" : "{main_root}/derivatives/fmriprep/23.1.3/output", "roi_root" : "{main_root}/derivatives/ROI_timeseries", "fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES", diff --git a/task_dFC/run_scripts_slurm/dataset_info.json b/task_dFC/run_scripts_slurm/dataset_info.json index bc39b0d..74f4ddf 100644 --- a/task_dFC/run_scripts_slurm/dataset_info.json +++ b/task_dFC/run_scripts_slurm/dataset_info.json @@ -1,6 +1,7 @@ { "dataset" : "", "main_root" : "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/{dataset}", + "bids_root" : "/home/mt00/scratch/DATA/task-based/openneuro/{dataset}/bids", "fmriprep_root" : "/home/mt00/scratch/DATA/task-based/openneuro/{dataset}/derivatives/fmriprep/23.1.3/output", "roi_root" : "{main_root}/derivatives/ROI_timeseries", "fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES", From 8bd3914b401568907be8825f5fc3578a7be5d578 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 12 Nov 2024 13:39:10 -0500 Subject: [PATCH 140/401] minor change --- task_dFC/run_scripts_slurm/run_nifti_to_roi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh index 36ada93..6d1d88d 100644 --- a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh +++ b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh @@ -5,7 +5,7 @@ #SBATCH --error=logs/roi_err.txt # Standard error log #SBATCH --account=rrg-jbpoline # Account #SBATCH --time=24:00:00 # Walltime for each task (24 hours) -#SBATCH --mem=32G # Memory request per node +#SBATCH --mem=64G # Memory request per node SUBJECT_LIST="./subj_list.txt" DATASET_INFO="./dataset_info.json" From 1b99922e31dc857a50d977574e5b1925311e487c Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Sun, 17 Nov 2024 21:56:15 -0500 Subject: [PATCH 141/401] minor fix --- task_dFC/run_scripts_slurm/run_fmriprep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh index 130ca58..8fecaea 100644 --- a/task_dFC/run_scripts_slurm/run_fmriprep.sh +++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh @@ -4,7 +4,7 @@ #SBATCH --output=logs/fmriprep_out.log # Standard output log #SBATCH --error=logs/fmriprep_err.log # Standard error log #SBATCH --time=7-00:00:00 # Walltime (7 days) -#SBATCH --mem=32G # Memory (32 GB) +#SBATCH --mem-per-cpu=16G # Memory (32 GB) #SBATCH --cpus-per-task=8 # Number of CPU cores (increase based on availability) #SBATCH --account=rrg-jbpoline # Account From f5188d58a20f7d56280c93e3d024ef99879ad871 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Tue, 19 Nov 2024 11:10:06 -0500 Subject: [PATCH 142/401] slurm change --- task_dFC/run_scripts_slurm/run_fmriprep.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh index 8fecaea..0abfdbc 100644 --- a/task_dFC/run_scripts_slurm/run_fmriprep.sh +++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh @@ -3,8 +3,8 @@ #SBATCH --job-name=fmriprep_job # Name of the job 
#SBATCH --output=logs/fmriprep_out.log # Standard output log #SBATCH --error=logs/fmriprep_err.log # Standard error log -#SBATCH --time=7-00:00:00 # Walltime (7 days) -#SBATCH --mem-per-cpu=16G # Memory (32 GB) +#SBATCH --time=2-00:00:00 # Walltime (2 day) +#SBATCH --mem-per-cpu=16G # Memory (16 GB) per cpu #SBATCH --cpus-per-task=8 # Number of CPU cores (increase based on availability) #SBATCH --account=rrg-jbpoline # Account From 809389308793e4f12b318a828214f666eaefa5e8 Mon Sep 17 00:00:00 2001 From: mtorabi59 Date: Thu, 21 Nov 2024 15:12:13 -0500 Subject: [PATCH 143/401] add invo and desc for nipoppy --- task_dFC/run_scripts_slurm/descriptor.json | 668 ++++++++++++++++++ task_dFC/run_scripts_slurm/global_config.json | 61 +- task_dFC/run_scripts_slurm/invocation.json | 24 + 3 files changed, 728 insertions(+), 25 deletions(-) create mode 100644 task_dFC/run_scripts_slurm/descriptor.json create mode 100644 task_dFC/run_scripts_slurm/invocation.json diff --git a/task_dFC/run_scripts_slurm/descriptor.json b/task_dFC/run_scripts_slurm/descriptor.json new file mode 100644 index 0000000..f039583 --- /dev/null +++ b/task_dFC/run_scripts_slurm/descriptor.json @@ -0,0 +1,668 @@ +{ + "name": "fmriprep", + "description": "fmriprep", + "tool-version": "23.1.3", + "schema-version": "0.5", + "command-line": "[[NIPOPPY_CONTAINER_COMMAND]] --bind $SLURM_TMPDIR:/work [[NIPOPPY_FPATH_CONTAINER]] [BIDS_DIR] [OUTPUT_DIR] [ANALYSIS_LEVEL] [SKIP_BIDS_VALIDATION] [PARTICIPANT_LABEL] [TASK_ID] [ECHO_IDX] [BIDS_FILTERS] [ANAT_DERIVATIVES] [BIDS_DATABASE_DIR] [NPROCS] [OMP_NTHREADS] [MEMORY_GB] [LOW_MEM] [USE_PLUGIN] [SLOPPY] [ANAT_ONLY] [BOILERPLATE_ONLY] [REPORTS_ONLY] [IGNORE] [OUTPUT_SPACES] [LONGITUDINAL] [BOLD2T1W_INIT] [BOLD2T1W_DOF] [USE_BBR] [SLICE_TIME_REF] [DUMMY_SCANS] [_RANDOM_SEED] [ME_T2S_FIT_METHOD] [OUTPUT_LAYOUT] [ME_OUTPUT_ECHOS] [MEDIAL_SURFACE_NAN] [PROJECT_GOODVOXELS] [MD_ONLY_BOILERPLATE] [CIFTI_OUTPUT] [USE_AROMA] [AROMA_MELODIC_DIM] [AROMA_ERR_ON_WARN] 
[REGRESSORS_ALL_COMPS] [REGRESSORS_FD_TH] [REGRESSORS_DVARS_TH] [SKULL_STRIP_TEMPLATE] [SKULL_STRIP_FIXED_SEED] [SKULL_STRIP_T1W] [FMAP_BSPLINE] [FMAP_NO_DEMEAN] [USE_SYN_SDC] [FORCE_SYN] [FS_LICENSE_FILE] [FS_SUBJECTS_DIR] [HIRES] [SKIP_RECONALL] [TRACK_CARBON] [COUNTRY_CODE] [VERSION] [VERBOSE_COUNT] [WORK_DIR] [CLEAN_WORKDIR] [RESOURCE_MONITOR] [CONFIG_FILE] [WRITE_GRAPH] [STOP_ON_FIRST_CRASH] [NOTRACK] [DEBUG]", + "inputs": [ + { + "id": "bids_dir", + "name": "bids_dir", + "description": "The root folder of a BIDS valid dataset (sub-XXXXX folders should be found at the top level in this folder).", + "optional": false, + "type": "String", + "value-key": "[BIDS_DIR]" + }, + { + "id": "output_dir", + "name": "output_dir", + "description": "The output path for the outcomes of preprocessing and visual reports", + "optional": false, + "type": "String", + "value-key": "[OUTPUT_DIR]" + }, + { + "id": "analysis_level", + "name": "analysis_level", + "description": "Processing stage to be run, only \"participant\" in the case of fMRIPrep (see BIDS-Apps specification).", + "optional": false, + "type": "String", + "value-key": "[ANALYSIS_LEVEL]", + "value-choices": [ + "participant" + ] + }, + { + "id": "skip_bids_validation", + "name": "skip_bids_validation", + "description": "Assume the input dataset is BIDS compliant and skip the validation", + "optional": true, + "type": "Flag", + "value-key": "[SKIP_BIDS_VALIDATION]", + "command-line-flag": "--skip_bids_validation" + }, + { + "id": "participant_label", + "name": "participant_label", + "description": "A space delimited list of participant identifiers or a single identifier (the sub- prefix can be removed)", + "optional": true, + "type": "String", + "value-key": "[PARTICIPANT_LABEL]", + "list": true, + "command-line-flag": "--participant-label" + }, + { + "id": "task_id", + "name": "task_id", + "description": "Select a specific task to be processed", + "optional": true, + "type": "String", + "value-key": "[TASK_ID]", + 
"command-line-flag": "-t" + }, + { + "id": "echo_idx", + "name": "echo_idx", + "description": "Select a specific echo to be processed in a multiecho series", + "optional": true, + "type": "Number", + "value-key": "[ECHO_IDX]", + "command-line-flag": "--echo-idx" + }, + { + "id": "bids_filters", + "name": "bids_filters", + "description": "A JSON file describing custom BIDS input filters using PyBIDS. For further details, please check out https://fmriprep.readthedocs.io/en/0/faq.html#how-do-I-select-only-certain-files-to-be-input-to-fMRIPrep", + "optional": true, + "type": "String", + "value-key": "[BIDS_FILTERS]", + "command-line-flag": "--bids-filter-file" + }, + { + "id": "anat_derivatives", + "name": "anat_derivatives", + "description": "Reuse the anatomical derivatives from another fMRIPrep run or calculated with an alternative processing tool (NOT RECOMMENDED).", + "optional": true, + "type": "String", + "value-key": "[ANAT_DERIVATIVES]", + "command-line-flag": "--anat-derivatives" + }, + { + "id": "bids_database_dir", + "name": "bids_database_dir", + "description": "Path to a PyBIDS database folder, for faster indexing (especially useful for large datasets). 
Will be created if not present.", + "optional": true, + "type": "String", + "value-key": "[BIDS_DATABASE_DIR]", + "command-line-flag": "--bids-database-dir" + }, + { + "id": "nprocs", + "name": "nprocs", + "description": "Maximum number of threads across all processes", + "optional": true, + "type": "String", + "value-key": "[NPROCS]", + "command-line-flag": "--nprocs" + }, + { + "id": "omp_nthreads", + "name": "omp_nthreads", + "description": "Maximum number of threads per-process", + "optional": true, + "type": "String", + "value-key": "[OMP_NTHREADS]", + "command-line-flag": "--omp-nthreads" + }, + { + "id": "memory_gb", + "name": "memory_gb", + "description": "Upper bound memory limit for fMRIPrep processes", + "optional": true, + "type": "String", + "value-key": "[MEMORY_GB]", + "command-line-flag": "--mem" + }, + { + "id": "low_mem", + "name": "low_mem", + "description": "Attempt to reduce memory usage (will increase disk usage in working directory)", + "optional": true, + "type": "Flag", + "value-key": "[LOW_MEM]", + "command-line-flag": "--low-mem" + }, + { + "id": "use_plugin", + "name": "use_plugin", + "description": "Nipype plugin configuration file", + "optional": true, + "type": "String", + "value-key": "[USE_PLUGIN]", + "command-line-flag": "--use-plugin" + }, + { + "id": "sloppy", + "name": "sloppy", + "description": "Use low-quality tools for speed - TESTING ONLY", + "optional": true, + "type": "Flag", + "value-key": "[SLOPPY]", + "command-line-flag": "--sloppy" + }, + { + "id": "anat_only", + "name": "anat_only", + "description": "Run anatomical workflows only", + "optional": true, + "type": "Flag", + "value-key": "[ANAT_ONLY]", + "command-line-flag": "--anat-only" + }, + { + "id": "boilerplate_only", + "name": "boilerplate_only", + "description": "Generate boilerplate only", + "optional": true, + "type": "Flag", + "value-key": "[BOILERPLATE_ONLY]", + "command-line-flag": "--boilerplate-only" + }, + { + "id": "reports_only", + "name": 
"reports_only", + "description": "Only generate reports, don't run workflows. This will only rerun report aggregation, not reportlet generation for specific nodes.", + "optional": true, + "type": "Flag", + "value-key": "[REPORTS_ONLY]", + "command-line-flag": "--reports-only" + }, + { + "id": "ignore", + "name": "ignore", + "description": "Ignore selected aspects of the input dataset to disable corresponding parts of the workflow (a space delimited list)", + "optional": true, + "type": "String", + "value-key": "[IGNORE]", + "list": true, + "value-choices": [ + "fieldmaps", + "slicetiming", + "sbref", + "t2w", + "flair" + ], + "command-line-flag": "--ignore" + }, + { + "id": "output_spaces", + "name": "output_spaces", + "description": "Standard and non-standard spaces to resample anatomical and functional images to. Standard spaces may be specified by the form ``[:cohort-