From 6a9e5cba965d9a0c4b7e2bb53bde28da689c4509 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 3 Apr 2024 16:04:25 -0400
Subject: [PATCH 001/401] start simul branch

---
 simul_dFC/FCS_estimate.py        | 144 +++++++++++++++++++++++++
 simul_dFC/dFC_assessment.py      | 102 ++++++++++++++++++
 simul_dFC/task_data_simulator.py | 176 +++++++++++++++++++++++++++++++
 3 files changed, 422 insertions(+)
 create mode 100644 simul_dFC/FCS_estimate.py
 create mode 100644 simul_dFC/dFC_assessment.py
 create mode 100644 simul_dFC/task_data_simulator.py

diff --git a/simul_dFC/FCS_estimate.py b/simul_dFC/FCS_estimate.py
new file mode 100644
index 0000000..caf2aa0
--- /dev/null
+++ b/simul_dFC/FCS_estimate.py
@@ -0,0 +1,144 @@
+import os
+import time
+import warnings
+
+import numpy as np
+
+from pydfc import MultiAnalysis, data_loader
+
+warnings.simplefilter("ignore")
+
+os.environ["MKL_NUM_THREADS"] = "16"
+os.environ["NUMEXPR_NUM_THREADS"] = "16"
+os.environ["OMP_NUM_THREADS"] = "16"
+
+################################# Parameters #################################
+# data paths
+dataset = "ds000001"
+# main_root = f"./DATA/{dataset}" # for local
+main_root = f"../../DATA/task-based/simulated/{dataset}"  # for server
+roi_root = f"{main_root}/derivatives/ROI_timeseries"
+output_root = f"{main_root}/derivatives/fitted_MEASURES"
+
+# for consistency we use 0 for resting state
+TASKS = ["task-pulse"]
+
+# job_id = int(os.getenv("SGE_TASK_ID"))
+# TASK_id = job_id-1 # SGE_TASK_ID starts from 1 not 0
+# if TASK_id >= len(TASKS):
+#     print("TASK_id out of TASKS")
+#     exit()
+TASK_id = 0
+task = TASKS[TASK_id]
+
+###### MEASUREMENT PARAMETERS ######
+
+# W is in sec
+
+params_methods = {
+    # Sliding Parameters
+    "W": 12,
+    "n_overlap": 1.0,
+    "sw_method": "pear_corr",
+    "tapered_window": True,
+    # TIME_FREQ
+    "TF_method": "WTC",
+    # CLUSTERING AND DHMM
+    "clstr_base_measure": "SlidingWindow",
+    # HMM
+    "hmm_iter": 20,
+    "dhmm_obs_state_ratio": 16 / 24,
+    # State Parameters
+    "n_states": 5,
+    "n_subj_clstrs": 10,
+    # Parallelization Parameters
+    "n_jobs": 2,
+    "verbose": 0,
+    "backend": "loky",
+    # SESSION
+    "session": task,
+    # Hyper Parameters
+    "normalization": True,
+    "num_subj": None,  # None or 200?
+    "num_time_point": None,  # None or set?
+}
+
+###### HYPER PARAMETERS ALTERNATIVE ######
+
+MEASURES_name_lst = [
+    "SlidingWindow",
+    "Time-Freq",
+    "CAP",
+    "ContinuousHMM",
+    "Windowless",
+    "Clustering",
+    "DiscreteHMM",
+]
+
+alter_hparams = {
+    # 'session': ['Rest1_RL', 'Rest2_LR', 'Rest2_RL'],
+    # 'n_overlap': [0, 0.25, 0.75, 1],
+    # 'n_states': [6, 16],
+    # # 'normalization': [],
+    # 'num_subj': [50, 100, 200],
+    # 'num_select_nodes': [30, 50, 333],
+    # 'num_time_point': [800, 1000],
+    # 'Fs_ratio': [0.50, 0.75, 1.5],
+    # 'noise_ratio': [1.00, 2.00, 3.00],
+    # 'num_realization': []
+}
+
+###### MultiAnalysis PARAMETERS ######
+
+params_multi_analysis = {
+    # Parallelization Parameters
+    "n_jobs": None,
+    "verbose": 0,
+    "backend": "loky",
+}
+
+################################# LOAD DATA #################################
+
+BOLD = data_loader.load_TS(
+    data_root=roi_root, file_name="time_series.npy", SESSIONs=task, subj_id2load=None
+)
+
+################################# Visualize BOLD #################################
+
+# for session in BOLD:
+#     BOLD.visualize(start_time=0, end_time=2000, nodes_lst=list(range(10)),
+#         save_image=False, output_root=None)
+
+################################ Measures of dFC #################################
+
+MA = MultiAnalysis(
+    analysis_name=f"simulated-task-based-dFC-{dataset}-{task}", **params_multi_analysis
+)
+
+MEASURES_lst = MA.measures_initializer(MEASURES_name_lst, params_methods, alter_hparams)
+
+tic = time.time()
+print("Measurement Started ...")
+
+################################# estimate FCS #################################
+
+for MEASURE_id, measure in enumerate(MEASURES_lst):
+
+    print("MEASURE: " + measure.measure_name)
+    print("FCS estimation started...")
+
+    if measure.is_state_based:
+        measure.estimate_FCS(time_series=BOLD)
+
+    # dFC_analyzer.estimate_group_FCS(time_series_dict=BOLD)
+    print("FCS estimation done.")
+
+    # Save. A single idempotent makedirs call replaces the exists()-then-makedirs()
+    # pair, which fails if the directory appears between the check and the create.
+    os.makedirs(f"{output_root}/{task}", exist_ok=True)
+    np.save(f"{output_root}/{task}/MEASURE_{str(MEASURE_id)}.npy", measure)
+
+print(f"Measurement required {time.time() - tic:0.3f} seconds.")
+np.save(f"{output_root}/{task}/multi_analysis.npy", MA)
+
+#################################################################################
diff --git a/simul_dFC/dFC_assessment.py b/simul_dFC/dFC_assessment.py
new file mode 100644
index 0000000..d140bd6
--- /dev/null
+++ b/simul_dFC/dFC_assessment.py
@@ -0,0 +1,102 @@
+import os
+import time
+import warnings
+
+import numpy as np
+
+from pydfc import MultiAnalysis, data_loader
+
+warnings.simplefilter("ignore")
+
+os.environ["MKL_NUM_THREADS"] = "16"
+os.environ["NUMEXPR_NUM_THREADS"] = "16"
+os.environ["OMP_NUM_THREADS"] = "16"
+
+################################# Parameters #################################
+
+# Data parameters
+dataset = "ds000001"
+# main_root = f"./DATA/{dataset}" # for local
+main_root = f"../../DATA/task-based/simulated/{dataset}"  # for server
+
+# subjects used for dFC assessment do not need to be the same as those used for FCS_estimate
+# you can set the new roi root and data load parameters here:
+roi_root = f"{main_root}/derivatives/ROI_timeseries"
+fitted_measures_root = f"{main_root}/derivatives/fitted_MEASURES"
+output_root = f"{main_root}/derivatives/dFC_assessed"
+
+# for consistency we use 0 for resting state. will this cause a problem here??
+TASKS = ["task-pulse"]
+
+# find all subjects across all tasks
+SUBJECTS = data_loader.find_subj_list(data_root=roi_root, sessions=TASKS)
+
+# job_id selects the subject; os.environ[] raises a clear KeyError if SGE_TASK_ID is unset
+job_id = int(os.environ["SGE_TASK_ID"])
+if not 1 <= job_id <= len(SUBJECTS):  # ids below 1 would wrap around via negative indexing
+    print(f"job_id {job_id} is outside 1..{len(SUBJECTS)}")
+    raise SystemExit(0)  # same exit status as before, without relying on site's exit()
+subj_id = SUBJECTS[job_id - 1]  # SGE_TASK_ID starts from 1 not 0
+
+for task in TASKS:
+
+    # check if the subject has this task before unpickling anything for it
+    SUBJECTS_with_this_task = data_loader.find_subj_list(
+        data_root=roi_root, sessions=[task]
+    )
+    if subj_id not in SUBJECTS_with_this_task:
+        print(f"subject {subj_id} not in the list of subjects with task {task}")
+        continue
+
+    MA = np.load(
+        f"{fitted_measures_root}/{task}/multi_analysis.npy", allow_pickle=True
+    ).item()
+
+    ################################# LOAD FIT MEASURES #################################
+
+    ALL_RECORDS = os.listdir(f"{fitted_measures_root}/{task}/")
+    ALL_RECORDS = [i for i in ALL_RECORDS if "MEASURE" in i]
+    # sort by the numeric id in MEASURE_<id>.npy: a plain string sort would put
+    # MEASURE_10.npy before MEASURE_2.npy once there are more than ten measures
+    ALL_RECORDS.sort(key=lambda s: int(os.path.splitext(s)[0].rsplit("_", 1)[-1]))
+    MEASURES_fit_lst = [
+        np.load(f"{fitted_measures_root}/{task}/{s}", allow_pickle=True).item()
+        for s in ALL_RECORDS
+    ]
+    MA.set_MEASURES_fit_lst(MEASURES_fit_lst)
+    print("fitted MEASURES loaded ...")
+
+    ################################# LOAD DATA #################################
+
+    print(
+        f"subject-level dFC assessment CODE started running ... for task {task} of subject {subj_id} ..."
+    )
+
+    BOLD = data_loader.load_TS(
+        data_root=roi_root,
+        file_name="time_series.npy",
+        SESSIONs=[task],
+        subj_id2load=subj_id,
+    )
+
+    ################################# dFC ASSESSMENT #################################
+
+    tic = time.time()
+    print("Measurement Started ...")
+
+    print("dFC estimation started...")
+    dFC_dict = MA.subj_lvl_dFC_assess(time_series_dict=BOLD)
+    print("dFC estimation done.")
+
+    print(f"Measurement required {time.time() - tic:0.3f} seconds.")
+
+    ################################# SAVE DATA #################################
+
+    folder = f"{output_root}/{task}/{subj_id}"
+    # idempotent directory creation; no separate exists() check needed
+    os.makedirs(folder, exist_ok=True)
+
+    for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]):
+        np.save(f"{folder}/dFC_{str(dFC_id)}.npy", dFC)
+
+#######################################################################################
diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
new file mode 100644
index 0000000..98fa832
--- /dev/null
+++ b/simul_dFC/task_data_simulator.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed March 20 2024
+
+@author: mte
+"""
+import os
+import warnings
+
+import numpy as np
+from tvb.simulator.lab import *
+
+from pydfc import TIME_SERIES, task_utils
+
+warnings.simplefilter("ignore")
+
+os.environ["MKL_NUM_THREADS"] = "16"
+os.environ["NUMEXPR_NUM_THREADS"] = "16"
+os.environ["OMP_NUM_THREADS"] = "16"
+################################# Parameters ####################################
+
+# data paths
+dataset = "ds000002"
+# main_root = f"./DATA/{dataset}" # for local
+main_root = f"../../DATA/task-based/simulated/{dataset}"  # for server
+output_root = f"{main_root}/derivatives/ROI_timeseries"
+
+task = "task-pulse"
+
+# simulation parameters
+sim_length = 250e3  # in m sec
+onset_time = 20.0  # in seconds
+task_duration = 12.0  # in seconds
+task_block_duration = 30.0  # in seconds
+BOLD_period = 500  # in m sec
+TAVG_period = 1.0  # in m sec
+conn_speed = 1.0
+D = 0.001  # noise dispersion
+dt = 0.5  # integration step
+n_subj = 200  # number of subjects
+
+# create a subject id list
+subj_list = [f"sub-{i:04d}" for i in range(1, n_subj + 1)]
+
+job_id = int(os.getenv("SGE_TASK_ID"))
+subj_id = subj_list[job_id - 1]  # SGE_TASK_ID starts from 1 not 0
+
+print(f"subject-level simulation started running ... for subject: {subj_id} ...")
+
+# randomize some parameters for each subject
+onset = np.random.normal(loc=onset_time, scale=0.5)  # seconds
+global_conn_coupling = np.random.normal(loc=0.0126, scale=0.0075)
+rand_weighting = np.array(
+    [
+        np.random.normal(loc=2.0**-2, scale=0.1 * (2.0**-2)),
+        np.random.normal(loc=2.0**-3, scale=0.1 * (2.0**-3)),
+        np.random.normal(loc=2.0**-4, scale=0.1 * (2.0**-4)),
+        np.random.normal(loc=2.0**-5, scale=0.1 * (2.0**-5)),
+        np.random.normal(loc=2.0**-6, scale=0.1 * (2.0**-6)),
+    ]
+)
+conn_speed_rand = np.random.normal(loc=conn_speed, scale=0.1 * conn_speed)
+################################# Initialize Simulation ####################################
+conn = connectivity.Connectivity.from_file()
+conn.speed = np.array([conn_speed_rand])
+
+# configure stimulus spatial pattern
+weighting = np.zeros((76,))
+weighting[[0, 7, 13, 33, 42]] = rand_weighting
+# weighting[[0, 7, 13, 33, 42]] = numpy.array([2.0 ** -2, 2.0 ** -3, 2.0 ** -4, 2.0 ** -5, 2.0 ** -6])
+
+# temporal profile
+eqn_t = equations.PulseTrain()
+eqn_t.parameters["onset"] = onset * 1e3  # ms
+eqn_t.parameters["tau"] = task_duration * 1e3  # ms
+eqn_t.parameters["T"] = task_block_duration * 1e3  # ms
+
+stimulus = patterns.StimuliRegion(temporal=eqn_t, connectivity=conn, weight=weighting)
+
+################################# Run Simulation ####################################
+
+# set the global coupling strength
+# you can switch between deterministic (without noise) and stochastic integration (with noise)
+sim = simulator.Simulator(
+    model=models.Generic2dOscillator(a=np.array([0.5])),
+    connectivity=conn,
+    coupling=coupling.Linear(a=np.array([global_conn_coupling])),
+    # integrator=integrators.HeunDeterministic(dt=dt),
+    integrator=integrators.HeunStochastic(
+        dt=dt, noise=noise.Additive(nsig=np.array([D]))
+    ),
+    monitors=(
+        monitors.TemporalAverage(period=TAVG_period),
+        monitors.Bold(period=BOLD_period, hrf_kernel=equations.MixtureOfGammas()),
+        monitors.ProgressLogger(period=10e3),
+    ),
+    stimulus=stimulus,
+    simulation_length=sim_length,
+).configure()
+
+(tavg_time, tavg_data), (bold_time, bold_data), _ = sim.run()
+
+# # truncate the first 10 seconds of the simulation
+# # to avoid transient effects
+# truncate_time = 10e3 # in m sec
+# bold_truncate_idx = int(truncate_time / BOLD_period)
+# bold_time = bold_time[bold_truncate_idx:]
+# bold_data = bold_data[bold_truncate_idx:]
+# tavg_truncate_idx = int(truncate_time / TAVG_period)
+# tavg_time = tavg_time[tavg_truncate_idx:]
+# tavg_data = tavg_data[tavg_truncate_idx:]
+
+centres_locs = conn.centres
+region_labels = list(conn.region_labels)
+TR_mri = BOLD_period * 1e-3  # in seconds
+
+bold_data = bold_data[:, 0, :, 0]
+# change time_series.shape to (roi, time)
+bold_data = bold_data.T
+
+time_series = TIME_SERIES(
+    data=bold_data,
+    subj_id=subj_id,
+    Fs=1 / TR_mri,
+    locs=centres_locs,
+    node_labels=region_labels,
+    TS_name=f"BOLD_{subj_id}_{task}",
+    session_name=task,
+)
+num_time_mri = time_series.n_time
+################################# EXTRACT TASK LABELS #########################
+oversampling = 50  # more samples per TR than the func data to have a better event_labels time resolution
+
+events = []
+event_types = ["rest", "task"]
+TASKS = [task]
+
+# using onset, task_duration, task_block_duration to create the events
+events.append(["onset", "duration", "trial_type"])
+t = onset  # seconds
+while t < sim_length * 1e-3:  # sim_length is in ms, event times are in seconds
+    events.append([t, task_duration, "task"])
+    t += task_block_duration
+events = np.array(events)
+
+event_labels, Fs_task = task_utils.events_time_to_labels(
+    events=events,
+    TR_mri=TR_mri,
+    num_time_mri=num_time_mri,
+    event_types=event_types,
+    oversampling=oversampling,
+    return_0_1=False,
+)
+# fill task labels with 0 (rest) and 1 (task's index, here only 1 task is used)
+task_labels = np.multiply(event_labels != 0, 1)
+################################# SAVE #################################
+# save the ROI time series and task data
+task_data = {
+    "task": task,
+    "task_labels": task_labels,
+    "task_types": TASKS,
+    "event_labels": event_labels,
+    "event_types": event_types,
+    "events": events,
+    "Fs_task": Fs_task,
+    "TR_mri": TR_mri,
+    "num_time_mri": num_time_mri,
+}
+subj_folder = f"{subj_id}_{task}"
+# idempotent directory creation; no separate exists() check needed
+os.makedirs(f"{output_root}/{subj_folder}/", exist_ok=True)
+np.save(f"{output_root}/{subj_folder}/time_series.npy", time_series)
+np.save(f"{output_root}/{subj_folder}/task_data.npy", task_data)
+
+print("****************** DONE ******************")
+####################################################################################

From 80ab1b0f8024b24c825cbe73a85d681551efa709 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 4 Apr 2024 15:20:58 -0400
Subject: [PATCH 002/401] add task features

---
 pydfc/task_utils.py | 68 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 3 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index daaf95e..982a2d8 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -224,11 +224,11 @@ def downsample_events_hrf(events_hrf, TR_mri, TR_task, method="uniform"):
     return events_hrf_ds
 
 
-def extract_task_presence(event_labels, TR_task, TR_array, TR_mri, binary=True):
+def extract_task_presence(event_labels, TR_task, TR_mri, TR_array=None, binary=True):
     """
     event_labels: event labels including 0 and event ids at the time each event happens
     TR_task: TR of task
-    TR_array: the time points of the dFC data
+    TR_array: the time points of the dFC data, optional
     TR_mri: TR of MRI
 
     This function extracts the task presence from the event labels and returns it in the same time points as the dFC data
@@ -262,6 +262,68 @@ def extract_task_presence(event_labels, TR_task, TR_array, TR_mri, binary=True):
     task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
 
     # some dFC measures (window-based) have a different TR than the task data
-    task_presence = task_presence[TR_array]
+    if TR_array is not None:
+        task_presence = task_presence[TR_array]
 
     return task_presence
+
+
+################################# Task Features ####################################
+
+
+def relative_task_on(task_presence):
+    """
+    task_presence: 0, 1 array
+    return: relative_task_on = sum(task_presence) / len(task_presence)
+    """
+    return np.sum(task_presence) / len(task_presence)
+
+
+def task_duration(task_presence, TR_mri):
+    """
+    task_presence: 0, 1 array
+    return: avg_task_duration, var_task_duration of the task blocks that end inside the array
+    """
+    task_durations = list()
+    # a block already on at index 0 starts there; otherwise `start` was read unassigned
+    start = 0 if len(task_presence) and task_presence[0] == 1 else None
+    for i in range(1, len(task_presence)):
+        if task_presence[i] == 1 and task_presence[i - 1] == 0:
+            start = i
+        if task_presence[i] == 0 and task_presence[i - 1] == 1:
+            task_durations.append((i - start) * TR_mri)
+            start = None
+    return np.mean(task_durations), np.var(task_durations)
+
+
+def rest_duration(task_presence, TR_mri):
+    """
+    task_presence: 0, 1 array
+    return: avg_rest_duration, var_rest_duration (lengths of the runs of 0s times TR_mri)
+    """
+    rest_durations = list()
+    if task_presence[0] == 0:  # array opens in rest: the first rest block starts at 0
+        start = 0
+    for i in range(1, len(task_presence)):
+        if task_presence[i] == 0 and task_presence[i - 1] == 1:  # 1 -> 0: rest begins
+            start = i
+        if task_presence[i] == 1 and task_presence[i - 1] == 0:  # 0 -> 1: rest ends
+            end = i
+            rest_durations.append((end - start) * TR_mri)
+            start = None
+    if task_presence[-1] == 0:  # a trailing rest block runs to the end of the array
+        end = len(task_presence)
+        rest_durations.append((end - start) * TR_mri)
+    rest_durations = np.array(rest_durations)
+    return np.mean(rest_durations), np.var(rest_durations)
+
+
+def transition_freq(task_presence):
+    """
+    task_presence: 0, 1 array
+    return: num_of_transitions, relative_transition_freq (transitions / len(task_presence))
+    """
+    transitions = np.abs(np.diff(task_presence))  # for a 0/1 array: 1 where neighbours differ
+    num_of_transitions = np.sum(transitions)
+    relative_transition_freq = num_of_transitions / len(task_presence)
+    return num_of_transitions, relative_transition_freq

From 1300355968abce83ed11fb8f879db021df2884de Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 9 Apr 2024 14:46:02 -0400
Subject: [PATCH 003/401] add KNN_ML

---
 simul_dFC/KNN_ML.py | 249 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 249 insertions(+)
 create mode 100644 simul_dFC/KNN_ML.py

diff --git a/simul_dFC/KNN_ML.py b/simul_dFC/KNN_ML.py
new file mode 100644
index 0000000..bf1a6c9
--- /dev/null
+++ b/simul_dFC/KNN_ML.py
@@ -0,0 +1,249 @@
+import os
+
+import numpy as np
+from sklearn.decomposition import PCA
+from sklearn.metrics import balanced_accuracy_score
+from sklearn.model_selection import GridSearchCV, train_test_split
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+
+from pydfc import DFC, data_loader, task_utils
+from pydfc.dfc_utils import dFC_mat2vec, rank_norm
+
+# Data parameters
+dataset = "ds000001"
+
+# main_root = f"./DATA/{dataset}" # for local
+main_root = f"../../DATA/task-based/simulated/{dataset}"  # for server
+roi_root = f"{main_root}/derivatives/ROI_timeseries"
+dFC_root = f"{main_root}/derivatives/dFC_assessed"
+output_root = "./ML_RESULTS_KNN_classify"
+
+TASKS = ["task-pulse"]
+
+normalize_dFC = True
+
+SUBJECTS = data_loader.find_subj_list(data_root=roi_root, sessions=TASKS)
+
+# randomly select 80% of the subjects for training and 20% for testing using numpy.random.choice
+train_subjects = np.random.choice(SUBJECTS, int(0.8 * len(SUBJECTS)), replace=False)
+test_subjects = np.setdiff1d(SUBJECTS, train_subjects)
+
+print(
+    f"number of train_subjects: {len(train_subjects)} and test_subjects: {len(test_subjects)}"
+)
+
+
+################## TASK FEATURES ##################
+
+task_features = {
+    "task": list(),
+    "relative_task_on": list(),
+    "avg_task_duration": list(),
+    "var_task_duration": list(),
+    "avg_rest_duration": list(),
+    "var_rest_duration": list(),
+    "num_of_transitions": list(),
+    "relative_transition_freq": list(),
+}
+for task_id, task in enumerate(TASKS):
+
+    if task == "task-restingstate":
+        continue
+
+    for subj in SUBJECTS:
+        # event data
+        task_data = np.load(
+            f"{roi_root}/{subj}_{task}/task_data.npy", allow_pickle="TRUE"
+        ).item()
+        Fs_task = task_data["Fs_task"]
+        TR_task = 1 / Fs_task
+
+        task_presence = task_utils.extract_task_presence(
+            event_labels=task_data["event_labels"],
+            TR_task=TR_task,
+            TR_mri=task_data["TR_mri"],
+            binary=True,
+        )
+
+        relative_task_on = task_utils.relative_task_on(task_presence)
+        # task duration
+        avg_task_duration, var_task_duration = task_utils.task_duration(
+            task_presence, task_data["TR_mri"]
+        )
+        # rest duration
+        avg_rest_duration, var_rest_duration = task_utils.rest_duration(
+            task_presence, task_data["TR_mri"]
+        )
+        # freq of transitions
+        num_of_transitions, relative_transition_freq = task_utils.transition_freq(
+            task_presence
+        )
+
+        task_features["task"].append(task)
+        task_features["relative_task_on"].append(relative_task_on)
+        task_features["avg_task_duration"].append(avg_task_duration)
+        task_features["var_task_duration"].append(var_task_duration)
+        task_features["avg_rest_duration"].append(avg_rest_duration)
+        task_features["var_rest_duration"].append(var_rest_duration)
+        task_features["num_of_transitions"].append(num_of_transitions)
+        task_features["relative_transition_freq"].append(relative_transition_freq)
+
+
+################## TASK PRESENCE CLASSIFICATION ##################
+ML_scores = {
+    "subj_id": list(),
+    "group": list(),
+    "task": list(),
+    "dFC method": list(),
+    "KNN accuracy": list(),
+}
+for dFC_id in range(0, 7):
+    print(f"=================== dFC {dFC_id} ===================")
+
+    ML_RESULT = {}
+    for task_id, task in enumerate(TASKS):
+        print(f"=============== {task} ===============")
+
+        if task == "task-restingstate":
+            continue
+
+        X_train = None
+        X_test = None
+        y_condition_train = None
+        y_condition_test = None
+        subj_label_train = list()
+        subj_label_test = list()
+
+        for subj in SUBJECTS:
+
+            dFC = np.load(
+                f"{dFC_root}/{task}/{subj}/dFC_{dFC_id}.npy", allow_pickle="TRUE"
+            ).item()
+
+            dFC_mat = dFC.get_dFC_mat()
+            TR_array = dFC.TR_array
+            if normalize_dFC:
+                dFC_mat = rank_norm(dFC_mat)
+
+            dFC_vecs = dFC_mat2vec(dFC_mat)
+
+            # event data
+            task_data = np.load(
+                f"{roi_root}/{subj}_{task}/task_data.npy", allow_pickle="TRUE"
+            ).item()
+            Fs_task = task_data["Fs_task"]
+            TR_task = 1 / Fs_task
+
+            task_presence = task_utils.extract_task_presence(
+                event_labels=task_data["event_labels"],
+                TR_task=TR_task,
+                TR_mri=task_data["TR_mri"],
+                TR_array=TR_array,
+                binary=True,
+            )
+
+            X_new = dFC_vecs
+            y_new = task_presence.ravel()
+
+            # concat current TR and two TR before of X_new to predict the current TR of y_new
+            # ignore the edge case of the first two TRs
+            X_new = np.concatenate(
+                (X_new, np.roll(X_new, 1, axis=0), np.roll(X_new, 2, axis=0)), axis=1
+            )
+            X_new = X_new[2:, :]
+            y_new = y_new[2:]
+
+            if subj in train_subjects:
+                subj_label_train.extend([subj for i in range(X_new.shape[0])])
+                if X_train is None and y_condition_train is None:
+                    X_train = X_new
+                    y_condition_train = y_new
+                else:
+                    X_train = np.concatenate((X_train, X_new), axis=0)
+                    y_condition_train = np.concatenate((y_condition_train, y_new), axis=0)
+            elif subj in test_subjects:
+                subj_label_test.extend([subj for i in range(X_new.shape[0])])
+                if X_test is None and y_condition_test is None:
+                    X_test = X_new
+                    y_condition_test = y_new
+                else:
+                    X_test = np.concatenate((X_test, X_new), axis=0)
+                    y_condition_test = np.concatenate((y_condition_test, y_new), axis=0)
+
+        print(
+            X_train.shape, X_test.shape, y_condition_train.shape, y_condition_test.shape
+        )
+        subj_label_train = np.array(subj_label_train)
+        subj_label_test = np.array(subj_label_test)
+        print(subj_label_train.shape, subj_label_test.shape)
+
+        # task presence classification
+
+        print("task presence classification ...")
+
+        pca = PCA(svd_solver="full", whiten=False)
+        pca.fit(X_train)
+        num_PCs = np.where(np.cumsum(pca.explained_variance_ratio_) > 0.95)[0][0] + 1
+
+        # create a new knn model
+        knn = KNeighborsClassifier()
+        # create a dictionary of all values we want to test for n_neighbors
+        param_grid = {"n_neighbors": np.arange(1, 30)}
+        # use gridsearch to test all values for n_neighbors
+        knn_gscv = GridSearchCV(knn, param_grid, cv=5)
+        # fit model to data
+        knn_gscv.fit(X_train, y_condition_train)
+
+        n_neighbors = knn_gscv.best_params_["n_neighbors"]
+
+        neigh = make_pipeline(
+            StandardScaler(),
+            PCA(n_components=num_PCs),
+            KNeighborsClassifier(n_neighbors=n_neighbors),
+        ).fit(X_train, y_condition_train)
+
+        ML_RESULT[task] = {
+            "pca": pca,
+            "num_PCs": num_PCs,
+            "cv_results": knn_gscv.cv_results_,
+            "KNN": neigh,
+            "KNN train score": neigh.score(X_train, y_condition_train),
+            "KNN test score": neigh.score(X_test, y_condition_test),
+        }
+
+        print(
+            f"KNN train score {dFC.measure.measure_name} {task}: {neigh.score(X_train, y_condition_train)}"
+        )
+        print(
+            f"KNN test score {dFC.measure.measure_name} {task}: {neigh.score(X_test, y_condition_test)}"
+        )
+
+        # measure pred score on each subj
+
+        for subj in SUBJECTS:
+            ML_scores["subj_id"].append(subj)
+            if subj in train_subjects:
+                ML_scores["group"].append("train")
+                features = X_train[subj_label_train == subj, :]
+                target = y_condition_train[subj_label_train == subj]
+            elif subj in test_subjects:
+                ML_scores["group"].append("test")
+                features = X_test[subj_label_test == subj, :]
+                target = y_condition_test[subj_label_test == subj]
+
+            pred = neigh.predict(features)
+
+            ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred))
+
+            ML_scores["task"].append(task)
+            ML_scores["dFC method"].append(dFC.measure.measure_name)
+
+    folder = f"{output_root}"
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+    np.save(f"{folder}/ML_RESULT_{dFC.measure.measure_name}.npy", ML_RESULT)
+
+np.save(f"{folder}/task_features_KNN_classify.npy", task_features)
+np.save(f"{folder}/ML_scores_KNN_classify.npy", ML_scores)

From eb69b24cbacbc95dc9a5e5020eed43f039792ebe Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 9 Apr 2024 15:01:56 -0400
Subject: [PATCH 004/401] update plot_task_dfc

---
 pydfc/task_utils.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 982a2d8..3fe2870 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -59,11 +59,11 @@ def events_time_to_labels(
 ################################# Visualization Functions ####################################
 
 
-def plot_task_dFC(task_labels, dFC_lst, event_types, Fs_mri, TR_step=12):
+def plot_task_dFC(task_presence, dFC_lst, Fs_mri, TR_step=12):
     """
-    task_labels: numpy array of shape (num_time_task, num_event_types) containing the event or task labels
-    this function assumes that the task data has the same Fs as the dFC data, i.e. MRI data
-    and that the time points of the task data are aligned with the time points of the dFC data
+    task_presence: numpy array containing the task presence in the time points of the dFC data
+    this function assumes that the task presence has the same Fs as the dFC data, i.e. MRI data
+    and that the time points of the task presence are aligned with the time points of the dFC data
     """
     conn_mat_size = 20
     scale_task_plot = 20
@@ -73,12 +73,8 @@ def plot_task_dFC(task_labels, dFC_lst, event_types, Fs_mri, TR_step=12):
 
     ax = plt.gca()
 
-    time = np.arange(0, task_labels.shape[0]) / Fs_mri
-    for i in range(0, task_labels.shape[1]):
-        ax.plot(
-            time, task_labels[:, i] * scale_task_plot, label=event_types[i], linewidth=4
-        )
-    plt.legend()
+    time = np.arange(0, task_presence.shape[0]) / Fs_mri
+    ax.plot(time, task_presence * scale_task_plot, linewidth=4)
     plt.xlabel("Time (s)")
 
     comman_TRs = TR_intersection(dFC_lst)

From 839928f30e8dcd747a6f28817fc4c7a4bb44d22b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 10 Apr 2024 13:42:43 -0400
Subject: [PATCH 005/401] add two TRs after for ML

---
 simul_dFC/KNN_ML.py | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/simul_dFC/KNN_ML.py b/simul_dFC/KNN_ML.py
index bf1a6c9..4de7f76 100644
--- a/simul_dFC/KNN_ML.py
+++ b/simul_dFC/KNN_ML.py
@@ -147,13 +147,28 @@
             X_new = dFC_vecs
             y_new = task_presence.ravel()
 
-            # concat current TR and two TR before of X_new to predict the current TR of y_new
-            # ignore the edge case of the first two TRs
+            # # concat current TR and two TR before of X_new to predict the current TR of y_new
+            # # ignore the edge case of the first two TRs
+            # X_new = np.concatenate(
+            #     (X_new, np.roll(X_new, 1, axis=0), np.roll(X_new, 2, axis=0)), axis=1
+            # )
+            # X_new = X_new[2:, :]
+            # y_new = y_new[2:]
+
+            # concat current TR and two TR before and after of X_new to predict the current TR of y_new
+            # ignore the edge case of the first and last two TRs
             X_new = np.concatenate(
-                (X_new, np.roll(X_new, 1, axis=0), np.roll(X_new, 2, axis=0)), axis=1
+                (
+                    X_new,
+                    np.roll(X_new, 1, axis=0),
+                    np.roll(X_new, 2, axis=0),
+                    np.roll(X_new, -1, axis=0),
+                    np.roll(X_new, -2, axis=0),
+                ),
+                axis=1,
             )
-            X_new = X_new[2:, :]
-            y_new = y_new[2:]
+            X_new = X_new[2:-2, :]
+            y_new = y_new[2:-2]
 
             if subj in train_subjects:
                 subj_label_train.extend([subj for i in range(X_new.shape[0])])

From c8d417bfdb250a51c8c82211ba20a2fe6090a772 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 18 Apr 2024 19:04:57 -0400
Subject: [PATCH 006/401] add dynamic_pred param

---
 simul_dFC/KNN_ML.py | 47 ++++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/simul_dFC/KNN_ML.py b/simul_dFC/KNN_ML.py
index 4de7f76..e2848f3 100644
--- a/simul_dFC/KNN_ML.py
+++ b/simul_dFC/KNN_ML.py
@@ -22,6 +22,7 @@
 
 TASKS = ["task-pulse"]
 
+dynamic_pred = "no"  # 'past' or 'past_and_future' or 'no' (only current TR)
 normalize_dFC = True
 
 SUBJECTS = data_loader.find_subj_list(data_root=roi_root, sessions=TASKS)
@@ -147,28 +148,30 @@
             X_new = dFC_vecs
             y_new = task_presence.ravel()
 
-            # # concat current TR and two TR before of X_new to predict the current TR of y_new
-            # # ignore the edge case of the first two TRs
-            # X_new = np.concatenate(
-            #     (X_new, np.roll(X_new, 1, axis=0), np.roll(X_new, 2, axis=0)), axis=1
-            # )
-            # X_new = X_new[2:, :]
-            # y_new = y_new[2:]
-
-            # concat current TR and two TR before and after of X_new to predict the current TR of y_new
-            # ignore the edge case of the first and last two TRs
-            X_new = np.concatenate(
-                (
-                    X_new,
-                    np.roll(X_new, 1, axis=0),
-                    np.roll(X_new, 2, axis=0),
-                    np.roll(X_new, -1, axis=0),
-                    np.roll(X_new, -2, axis=0),
-                ),
-                axis=1,
-            )
-            X_new = X_new[2:-2, :]
-            y_new = y_new[2:-2]
+            if dynamic_pred == "past":
+                # concat current TR and two TR before of X_new to predict the current TR of y_new
+                # ignore the edge case of the first two TRs
+                X_new = np.concatenate(
+                    (X_new, np.roll(X_new, 1, axis=0), np.roll(X_new, 2, axis=0)), axis=1
+                )
+                X_new = X_new[2:, :]
+                y_new = y_new[2:]
+
+            elif dynamic_pred == "past_and_future":
+                # concat current TR and two TR before and after of X_new to predict the current TR of y_new
+                # ignore the edge case of the first and last two TRs
+                X_new = np.concatenate(
+                    (
+                        X_new,
+                        np.roll(X_new, 1, axis=0),
+                        np.roll(X_new, 2, axis=0),
+                        np.roll(X_new, -1, axis=0),
+                        np.roll(X_new, -2, axis=0),
+                    ),
+                    axis=1,
+                )
+                X_new = X_new[2:-2, :]
+                y_new = y_new[2:-2]
 
             if subj in train_subjects:
                 subj_label_train.extend([subj for i in range(X_new.shape[0])])

From 67e176d78737281b5c2c50028fd1bb8f0f913dd2 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 19 Apr 2024 18:08:51 -0400
Subject: [PATCH 007/401] correct make_pipeline KNN_ML

---
 simul_dFC/KNN_ML.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/simul_dFC/KNN_ML.py b/simul_dFC/KNN_ML.py
index e2848f3..44eca0a 100644
--- a/simul_dFC/KNN_ML.py
+++ b/simul_dFC/KNN_ML.py
@@ -201,20 +201,25 @@
 
         print("task presence classification ...")
 
+        # find num_PCs
         pca = PCA(svd_solver="full", whiten=False)
         pca.fit(X_train)
         num_PCs = np.where(np.cumsum(pca.explained_variance_ratio_) > 0.95)[0][0] + 1
 
-        # create new a knn model
-        knn = KNeighborsClassifier()
+        # create a pipeline with a knn model to find the best n_neighbors
+        knn = make_pipeline(
+            StandardScaler(),
+            PCA(n_components=num_PCs),
+            KNeighborsClassifier(),
+        )
         # create a dictionary of all values we want to test for n_neighbors
-        param_grid = {"n_neighbors": np.arange(1, 30)}
+        param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)}
         # use gridsearch to test all values for n_neighbors
         knn_gscv = GridSearchCV(knn, param_grid, cv=5)
         # fit model to data
         knn_gscv.fit(X_train, y_condition_train)
 
-        n_neighbors = knn_gscv.best_params_["n_neighbors"]
+        n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"]
 
         neigh = make_pipeline(
             StandardScaler(),

From 472d427ed139fd9bcd5ec0d3561f5dad120adcc1 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 23 Apr 2024 13:08:38 -0400
Subject: [PATCH 008/401] find events columns idx

---
 pydfc/task_utils.py | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 3fe2870..a24b3cf 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -6,6 +6,8 @@
 @author: Mohammad Torabi
 """
 
+import warnings
+
 import matplotlib.pyplot as plt
 import numpy as np
 from nilearn import glm
@@ -25,15 +27,21 @@ def events_time_to_labels(
     It assumes that the first time point is TR0 which corresponds to [0 sec, TR sec] interval.
     oversampling: number of samples per TR_mri to improve the time resolution of tasks
     """
+
+    # find which column is the "onset" in the first row
+    onset_idx = np.where(events[0, :] == "onset")[0][0]
+    duration_idx = np.where(events[0, :] == "duration")[0][0]
+    trial_type_idx = np.where(events[0, :] == "trial_type")[0][0]
+
     assert (
-        events[0, 0] == "onset"
-    ), "The first column of the events file should be the onset!"
+        events[0, onset_idx] == "onset"
+    ), "Something went wrong with the events file! The onset column was not found!"
     assert (
-        events[0, 1] == "duration"
-    ), "The second column of the events file should be the duration!"
+        events[0, duration_idx] == "duration"
+    ), "Something went wrong with the events file! The duration column was not found!"
     assert (
-        events[0, 2] == "trial_type"
-    ), "The third column of the events file should be the trial type!"
+        events[0, trial_type_idx] == "trial_type"
+    ), "Something went wrong with the events file! The trial_type column was not found!"
 
     Fs = float(1 / TR_mri) * oversampling
     num_time_task = int(num_time_mri * oversampling)
@@ -43,12 +51,16 @@ def events_time_to_labels(
         if i == 0:
             continue
 
-        if events[i, 2] in event_types:
-            start_time = float(events[i, 0])
-            end_time = float(events[i, 0]) + float(events[i, 1])
+        if events[i, trial_type_idx] in event_types:
+            if events[i, trial_type_idx] == "rest":
+                warnings.warn("trial types should not include 'rest'")
+            start_time = float(events[i, onset_idx])
+            end_time = float(events[i, onset_idx]) + float(events[i, duration_idx])
             start_timepoint = int(np.rint(start_time * Fs))
             end_timepoint = int(np.rint(end_time * Fs))
-            event_labels[start_timepoint:end_timepoint] = event_types.index(events[i, 2])
+            event_labels[start_timepoint:end_timepoint] = event_types.index(
+                events[i, trial_type_idx]
+            )
 
     if return_0_1:
         event_labels = np.multiply(event_labels != 0, 1)

From daa3e6bf89cc4cac0c3f8cd7b16cbbc250746b51 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 26 Apr 2024 15:40:30 -0400
Subject: [PATCH 009/401] add simul_utils

---
 pydfc/__init__.py                |   2 +
 pydfc/simul_utils.py             | 331 +++++++++++++++++++++++++++++++
 simul_dFC/task_data_simulator.py | 231 ++++++++++-----------
 task_dFC/nifti_to_roi_signal.py  | 304 +++++++++++++++++-----------
 4 files changed, 625 insertions(+), 243 deletions(-)
 create mode 100644 pydfc/simul_utils.py

diff --git a/pydfc/__init__.py b/pydfc/__init__.py
index d5ac722..c793222 100644
--- a/pydfc/__init__.py
+++ b/pydfc/__init__.py
@@ -27,4 +27,6 @@
     "dfc_methods",
     "dfc_utils",
     "comparison",
+    "task_utils",
+    "simul_utils",
 ]
diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py
new file mode 100644
index 0000000..109aac2
--- /dev/null
+++ b/pydfc/simul_utils.py
@@ -0,0 +1,331 @@
+# -*- coding: utf-8 -*-
+"""
+Functions to facilitate dFC simulation.
+
+Created on April 25 2024
+@author: Mohammad Torabi
+"""
+
+import re
+from calendar import c
+
+import numpy as np
+from matplotlib.pylab import rand
+from tvb.simulator.lab import *
+
+from pydfc import TIME_SERIES, task_utils
+
+################################# Simulation Functions ####################################
+
+
+def create_random_stimulus_weights(stimulated_regions_list, n_regions=76):
+    """
+    Return a length-n_regions weight vector: random at the stimulated indices, 0 elsewhere.
+    """
+    rand_weighting = [
+        np.random.normal(loc=2.0 ** (-1 * (2 + i)), scale=0.1 * (2.0**-2))
+        for i in range(len(stimulated_regions_list))
+    ]  # i-th listed region: mean 2**-(2+i); NOTE(review): std is 0.1 * 2**-2 for all i, confirm
+
+    # scatter the drawn weights into an all-zero spatial pattern
+    weighting = np.zeros((n_regions,))
+    weighting[stimulated_regions_list] = rand_weighting
+
+    return weighting
+
+
+def create_stimulus(
+    onset,
+    task_duration,
+    task_block_duration,
+    conn,
+    region_weighting,
+):
+    """
+    Build a StimuliRegion with a PulseTrain temporal profile; time arguments are in seconds.
+    """
+    # temporal profile: every time argument is scaled by 1e3 (seconds -> ms) before use
+    eqn_t = equations.PulseTrain()
+    eqn_t.parameters["onset"] = onset * 1e3  # ms
+    eqn_t.parameters["tau"] = task_duration * 1e3  # ms
+    eqn_t.parameters["T"] = task_block_duration * 1e3  # ms
+
+    stimulus = patterns.StimuliRegion(
+        temporal=eqn_t, connectivity=conn, weight=region_weighting
+    )  # spatial pattern comes from region_weighting, attached to connectivity conn
+
+    return stimulus
+
+
+def simulate_task_BOLD(
+    onset_time,
+    task_duration,
+    task_block_duration,
+    sim_length,
+    BOLD_period,
+    TAVG_period,
+    global_conn_coupling_coef=0.0126,
+    D=0.001,
+    conn_speed=1.0,
+    dt=0.5,
+    drop_initial_time=False,
+):
+    """
+    Run one randomly jittered task simulation; return BOLD/TAVG arrays as (roi, time) plus metadata.
+    """
+    # draw onset, global coupling and conduction speed around the given values on every call
+    onset = np.random.normal(loc=onset_time, scale=0.5)  # seconds; NOTE(review): not returned
+    global_conn_coupling = np.random.normal(loc=global_conn_coupling_coef, scale=0.0075)
+    conn_speed_rand = np.random.normal(loc=conn_speed, scale=0.1 * conn_speed)
+    ################################# Initialize Simulation ####################################
+    conn = connectivity.Connectivity.from_file()
+    conn.speed = np.array([conn_speed_rand])
+
+    # configure stimulus spatial pattern (region indices and region count are hard-coded here)
+    weighting = create_random_stimulus_weights(
+        stimulated_regions_list=[0, 7, 13, 33, 42], n_regions=76
+    )
+
+    stimulus = create_stimulus(
+        onset=onset,
+        task_duration=task_duration,
+        task_block_duration=task_block_duration,
+        conn=conn,
+        region_weighting=weighting,
+    )
+
+    ################################# Run Simulation ####################################
+
+    # the jittered global coupling strength is passed to the linear coupling below
+    # you can switch between deterministic (without noise) and stochastic integration (with noise)
+    sim = simulator.Simulator(
+        model=models.Generic2dOscillator(a=np.array([0.5])),
+        connectivity=conn,
+        coupling=coupling.Linear(a=np.array([global_conn_coupling])),
+        # integrator=integrators.HeunDeterministic(dt=dt),
+        integrator=integrators.HeunStochastic(
+            dt=dt, noise=noise.Additive(nsig=np.array([D]))
+        ),
+        monitors=(
+            monitors.TemporalAverage(period=TAVG_period),
+            monitors.Bold(period=BOLD_period, hrf_kernel=equations.MixtureOfGammas()),
+            monitors.ProgressLogger(period=10e3),
+        ),
+        stimulus=stimulus,
+        simulation_length=sim_length,
+    ).configure()
+
+    (tavg_time, tavg_data), (bold_time, bold_data), _ = sim.run()  # 3rd monitor output unused
+
+    if drop_initial_time:
+        # truncate the first 10 seconds of the simulation
+        # to avoid transient effects
+        truncate_time = 10e3  # in m sec
+        bold_truncate_idx = int(truncate_time / BOLD_period)
+        bold_time = bold_time[bold_truncate_idx:]
+        bold_data = bold_data[bold_truncate_idx:]
+        tavg_truncate_idx = int(truncate_time / TAVG_period)
+        tavg_time = tavg_time[tavg_truncate_idx:]
+        tavg_data = tavg_data[tavg_truncate_idx:]
+
+    centres_locs = conn.centres
+    region_labels = list(conn.region_labels)
+    TR_mri = BOLD_period * 1e-3  # in seconds
+
+    bold_data = bold_data[:, 0, :, 0]  # keep index 0 on axes 1 and 3 -> 2-D array
+    # change time_series.shape to (roi, time)
+    bold_data = bold_data.T
+
+    TAVG_data = tavg_data[:, 0, :, 0]  # same reduction for the temporal-average output
+    # change time_series.shape to (roi, time)
+    TAVG_data = TAVG_data.T
+
+    return (
+        bold_data,
+        bold_time,
+        region_labels,
+        centres_locs,
+        TR_mri,
+        TAVG_data,
+        tavg_time,
+        TAVG_period,
+    )
+
+
+def create_simul_task_info(
+    num_time_mri,
+    TR_mri,
+    task,
+    onset,
+    task_duration,
+    task_block_duration,
+    sim_length,
+    oversampling=50,
+):
+    """
+    Create and return a dictionary containing the task data for simulation.
+
+    Parameters
+    ----------
+    num_time_mri : int
+        Number of time points in the BOLD signal.
+    TR_mri : float
+        The repetition time of the MRI, in seconds.
+    task : str
+        The task name.
+    onset : float
+        The onset time of the first task block, in seconds.
+    task_duration : float
+        The duration of the task within each block, in seconds.
+    task_block_duration : float
+        The duration of one task block (onset-to-onset period), in seconds.
+    sim_length : float
+        The length of the simulation, in milliseconds.
+    oversampling : int, optional
+        The oversampling factor. The default is 50.
+        generate more samples per TR than the func data to have a
+        better event_labels time resolution
+    """
+    ################################# EXTRACT TASK LABELS #########################
+    events = []
+    event_types = ["rest", "task"]
+
+    # using onset, task_duration, task_block_duration to create the events
+    events.append(["onset", "duration", "trial_type"])
+    t = onset
+    while t < sim_length * 1e-3:  # sim_length is in ms, event times are in seconds
+        events.append([t, task_duration, "task"])
+        t += task_block_duration
+    events = np.array(events)
+
+    event_labels, Fs_task = task_utils.events_time_to_labels(
+        events=events,
+        TR_mri=TR_mri,
+        num_time_mri=num_time_mri,
+        event_types=event_types,
+        oversampling=oversampling,
+        return_0_1=False,
+    )
+    # fill task labels with 0 (rest) and 1 (task's index, here only 1 task is used)
+    task_labels = np.multiply(event_labels != 0, 1)
+    ################################# COLLECT #################################
+    # gather the labels and timing metadata; nothing is written to disk here
+    task_data = {
+        "task": task,
+        "task_labels": task_labels,
+        "event_labels": event_labels,
+        "event_types": event_types,
+        "events": events,
+        "Fs_task": Fs_task,
+        "TR_mri": TR_mri,
+        "num_time_mri": num_time_mri,
+    }
+
+    return task_data
+
+
+def simulate_task_BOLD_TS(
+    subj_id,
+    task,
+    onset_time,
+    task_duration,
+    task_block_duration,
+    sim_length,
+    BOLD_period,
+    TAVG_period,
+    global_conn_coupling_coef=0.0126,
+    D=0.001,
+    conn_speed=1.0,
+    dt=0.5,
+    drop_initial_time=False,
+):
+    """
+    Simulate task BOLD and return (TIME_SERIES object, task_data dict of labels and timing).
+    """
+    bold_data, bold_time, region_labels, centres_locs, TR_mri, _, _, _ = (
+        simulate_task_BOLD(
+            onset_time=onset_time,
+            task_duration=task_duration,
+            task_block_duration=task_block_duration,
+            sim_length=sim_length,
+            BOLD_period=BOLD_period,
+            TAVG_period=TAVG_period,
+            global_conn_coupling_coef=global_conn_coupling_coef,
+            D=D,
+            conn_speed=conn_speed,
+            dt=dt,
+            drop_initial_time=drop_initial_time,
+        )
+    )  # the last three returned values are discarded; bold_time is not used below
+    time_series = TIME_SERIES(
+        data=bold_data,
+        subj_id=subj_id,
+        Fs=1 / TR_mri,
+        locs=centres_locs,
+        node_labels=region_labels,
+        TS_name=f"BOLD_{subj_id}_{task}",
+        session_name=task,
+    )
+    num_time_mri = time_series.n_time  # label length is derived from the built time series
+    task_data = create_simul_task_info(
+        num_time_mri=num_time_mri,
+        TR_mri=TR_mri,
+        task=task,
+        onset=onset_time,  # NOTE(review): nominal onset; the simulation jitters its own, confirm
+        task_duration=task_duration,
+        task_block_duration=task_block_duration,
+        sim_length=sim_length,
+    )
+
+    return time_series, task_data
+
+
+def simulate_task(subj_id, task_info):
+    """
+    Simulate task-based BOLD signal for a subject; returns (time_series, task_data).
+
+    Parameters
+    ----------
+    subj_id : int
+        The subject ID.
+    task_info : dict
+        A dictionary containing the task information below:
+            - task_name: str
+                The name of the task.
+            - onset_time: float
+                The onset time of the task.
+            - task_duration: float
+                The duration of the task.
+            - task_block_duration: float
+                The duration of the task block.
+            - sim_length: float
+                The length of the simulation.
+            - BOLD_period: float
+                The BOLD period.
+            - TAVG_period: float
+                The TAVG period.
+            - global_conn_coupling_coef: float
+                The global connectivity coupling coefficient.
+            - D: float
+                The noise parameter.
+            - conn_speed: float
+                The connectivity speed.
+            - dt: float
+                The simulation time step.
+    """
+    time_series, task_data = simulate_task_BOLD_TS(
+        subj_id=subj_id,
+        task=task_info["task_name"],
+        onset_time=task_info["onset_time"],
+        task_duration=task_info["task_duration"],
+        task_block_duration=task_info["task_block_duration"],
+        sim_length=task_info["sim_length"],
+        BOLD_period=task_info["BOLD_period"],
+        TAVG_period=task_info["TAVG_period"],
+        global_conn_coupling_coef=task_info["global_conn_coupling_coef"],
+        D=task_info["D"],
+        conn_speed=task_info["conn_speed"],
+        dt=task_info["dt"],  # drop_initial_time is not forwarded and keeps its default
+    )
+
+    return time_series, task_data
diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index 98fa832..f6bc3a9 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -10,7 +10,7 @@
 import numpy as np
 from tvb.simulator.lab import *
 
-from pydfc import TIME_SERIES, task_utils
+from pydfc import simul_utils
 
 warnings.simplefilter("ignore")
 
@@ -20,20 +20,17 @@
 ################################# Parameters ####################################
 
 # data paths
-dataset = "ds000002"
+dataset = "ds000001"
 # main_root = f"./DATA/{dataset}" # for local
 main_root = f"../../DATA/task-based/simulated/{dataset}"  # for server
 output_root = f"{main_root}/derivatives/ROI_timeseries"
 
-task = "task-pulse"
-
 # simulation parameters
 sim_length = 250e3  # in m sec
 onset_time = 20.0  # in seconds
-task_duration = 12.0  # in seconds
-task_block_duration = 30.0  # in seconds
 BOLD_period = 500  # in m sec
 TAVG_period = 1.0  # in m sec
+global_conn_coupling_coef = 0.0126
 conn_speed = 1.0
 D = 0.001  # noise dispersion
 dt = 0.5  # integration step
@@ -47,130 +44,110 @@
 
 print(f"subject-level simulation started running ... for subject: {subj_id} ...")
 
-# randomize some parameters for each subjects
-onset = np.random.normal(loc=onset_time, scale=0.5)  # seconds
-global_conn_coupling = np.random.normal(loc=0.0126, scale=0.0075)
-rand_weighting = np.array(
-    [
-        np.random.normal(loc=2.0**-2, scale=0.1 * (2.0**-2)),
-        np.random.normal(loc=2.0**-3, scale=0.1 * (2.0**-3)),
-        np.random.normal(loc=2.0**-4, scale=0.1 * (2.0**-4)),
-        np.random.normal(loc=2.0**-5, scale=0.1 * (2.0**-5)),
-        np.random.normal(loc=2.0**-6, scale=0.1 * (2.0**-6)),
-    ]
-)
-conn_speed_rand = np.random.normal(loc=conn_speed, scale=0.1 * conn_speed)
-################################# Initialize Simulation ####################################
-conn = connectivity.Connectivity.from_file()
-conn.speed = np.array([conn_speed_rand])
-
-# configure stimulus spatial pattern
-weighting = np.zeros((76,))
-weighting[[0, 7, 13, 33, 42]] = rand_weighting
-# weighting[[0, 7, 13, 33, 42]] = numpy.array([2.0 ** -2, 2.0 ** -3, 2.0 ** -4, 2.0 ** -5, 2.0 ** -6])
-
-# temporal profile
-eqn_t = equations.PulseTrain()
-eqn_t.parameters["onset"] = onset * 1e3  # ms
-eqn_t.parameters["tau"] = task_duration * 1e3  # ms
-eqn_t.parameters["T"] = task_block_duration * 1e3  # ms
-
-stimulus = patterns.StimuliRegion(temporal=eqn_t, connectivity=conn, weight=weighting)
-
-################################# Run Simulation ####################################
-
-# set the global coupling strength
-# you can switch between deterministic (without noise) and stochastic integration (with noise)
-sim = simulator.Simulator(
-    model=models.Generic2dOscillator(a=np.array([0.5])),
-    connectivity=conn,
-    coupling=coupling.Linear(a=np.array([global_conn_coupling])),
-    # integrator=integrators.HeunDeterministic(dt=dt),
-    integrator=integrators.HeunStochastic(
-        dt=dt, noise=noise.Additive(nsig=np.array([D]))
-    ),
-    monitors=(
-        monitors.TemporalAverage(period=TAVG_period),
-        monitors.Bold(period=BOLD_period, hrf_kernel=equations.MixtureOfGammas()),
-        monitors.ProgressLogger(period=10e3),
-    ),
-    stimulus=stimulus,
-    simulation_length=sim_length,
-).configure()
-
-(tavg_time, tavg_data), (bold_time, bold_data), _ = sim.run()
-
-# # truncate the first 10 seconds of the simulation
-# # to avoid transient effects
-# truncate_time = 10e3 # in m sec
-# bold_truncate_idx = int(truncate_time / BOLD_period)
-# bold_time = bold_time[bold_truncate_idx:]
-# bold_data = bold_data[bold_truncate_idx:]
-# tavg_truncate_idx = int(truncate_time / TAVG_period)
-# tavg_time = tavg_time[tavg_truncate_idx:]
-# tavg_data = tavg_data[tavg_truncate_idx:]
-
-centres_locs = conn.centres
-region_labels = list(conn.region_labels)
-TR_mri = BOLD_period * 1e-3  # in seconds
-
-bold_data = bold_data[:, 0, :, 0]
-# change time_series.shape to (roi, time)
-bold_data = bold_data.T
-
-time_series = TIME_SERIES(
-    data=bold_data,
-    subj_id=subj_id,
-    Fs=1 / TR_mri,
-    locs=centres_locs,
-    node_labels=region_labels,
-    TS_name=f"BOLD_{subj_id}_{task}",
-    session_name=task,
-)
-num_time_mri = time_series.n_time
-################################# EXTRACT TASK LABELS #########################
-oversampling = 50  # more samples per TR than the func data to have a better event_labels time resolution
+all_task_info = {
+    "task-midFreqMidRest": {
+        "task_name": "task-midFreqMidRest",
+        "onset_time": onset_time,
+        "task_duration": 12.0,
+        "task_block_duration": 30.0,
+        "sim_length": sim_length,
+        "BOLD_period": BOLD_period,
+        "TAVG_period": TAVG_period,
+        "global_conn_coupling_coef": global_conn_coupling_coef,
+        "D": D,
+        "conn_speed": conn_speed,
+        "dt": dt,
+    },
+    "task-lowFreqLongRest": {
+        "task_name": "task-lowFreqLongRest",
+        "onset_time": onset_time,
+        "task_duration": 20.0,
+        "task_block_duration": 40.0,
+        "sim_length": sim_length,
+        "BOLD_period": BOLD_period,
+        "TAVG_period": TAVG_period,
+        "global_conn_coupling_coef": global_conn_coupling_coef,
+        "D": D,
+        "conn_speed": conn_speed,
+        "dt": dt,
+    },
+    "task-lowFreqShortRest": {
+        "task_name": "task-lowFreqShortRest",
+        "onset_time": onset_time,
+        "task_duration": 20.0,
+        "task_block_duration": 25.0,
+        "sim_length": sim_length,
+        "BOLD_period": BOLD_period,
+        "TAVG_period": TAVG_period,
+        "global_conn_coupling_coef": global_conn_coupling_coef,
+        "D": D,
+        "conn_speed": conn_speed,
+        "dt": dt,
+    },
+    "task-lowFreqShortTask": {
+        "task_name": "task-lowFreqShortTask",
+        "onset_time": onset_time,
+        "task_duration": 5.0,
+        "task_block_duration": 30.0,
+        "sim_length": sim_length,
+        "BOLD_period": BOLD_period,
+        "TAVG_period": TAVG_period,
+        "global_conn_coupling_coef": global_conn_coupling_coef,
+        "D": D,
+        "conn_speed": conn_speed,
+        "dt": dt,
+    },
+    "task-highFreqLongRest": {
+        "task_name": "task-highFreqLongRest",
+        "onset_time": onset_time,
+        "task_duration": 1.0,
+        "task_block_duration": 5.0,
+        "sim_length": sim_length,
+        "BOLD_period": BOLD_period,
+        "TAVG_period": TAVG_period,
+        "global_conn_coupling_coef": global_conn_coupling_coef,
+        "D": D,
+        "conn_speed": conn_speed,
+        "dt": dt,
+    },
+    "task-highFreqShortRest": {
+        "task_name": "task-highFreqShortRest",
+        "onset_time": onset_time,
+        "task_duration": 4.0,
+        "task_block_duration": 5.0,
+        "sim_length": sim_length,
+        "BOLD_period": BOLD_period,
+        "TAVG_period": TAVG_period,
+        "global_conn_coupling_coef": global_conn_coupling_coef,
+        "D": D,
+        "conn_speed": conn_speed,
+        "dt": dt,
+    },
+    "task-midFreqMidRestNoisy": {
+        "task_name": "task-midFreqMidRestNoisy",
+        "onset_time": onset_time,
+        "task_duration": 12.0,
+        "task_block_duration": 30.0,
+        "sim_length": sim_length,
+        "BOLD_period": BOLD_period,
+        "TAVG_period": TAVG_period,
+        "global_conn_coupling_coef": global_conn_coupling_coef,
+        "D": D * 100,
+        "conn_speed": conn_speed,
+        "dt": dt,
+    },
+}
 
-events = []
-event_types = ["rest", "task"]
-TASKS = [task]
+for task in all_task_info:
 
-# using onset, task_duration, task_block_duration to create the events
-events.append(["onset", "duration", "trial_type"])
-t = onset
-while t < sim_length:
-    events.append([t, task_duration, "task"])
-    t += task_block_duration
-events = np.array(events)
+    time_series, task_data = simul_utils.simulate_task(subj_id, all_task_info[task])
 
-event_labels, Fs_task = task_utils.events_time_to_labels(
-    events=events,
-    TR_mri=TR_mri,
-    num_time_mri=num_time_mri,
-    event_types=event_types,
-    oversampling=oversampling,
-    return_0_1=False,
-)
-# fill task labels with 0 (rest) and 1 (task's index, here only 1 task is used)
-task_labels = np.multiply(event_labels != 0, 1)
-################################# SAVE #################################
-# save the ROI time series and task data
-task_data = {
-    "task": task,
-    "task_labels": task_labels,
-    "task_types": TASKS,
-    "event_labels": event_labels,
-    "event_types": event_types,
-    "events": events,
-    "Fs_task": Fs_task,
-    "TR_mri": TR_mri,
-    "num_time_mri": num_time_mri,
-}
-subj_folder = f"{subj_id}_{task}"
-if not os.path.exists(f"{output_root}/{subj_folder}/"):
-    os.makedirs(f"{output_root}/{subj_folder}/")
-np.save(f"{output_root}/{subj_folder}/time_series.npy", time_series)
-np.save(f"{output_root}/{subj_folder}/task_data.npy", task_data)
+    # save the time series and task data
+    subj_folder = f"{subj_id}_{task}"
+    if not os.path.exists(f"{output_root}/{subj_folder}/"):
+        os.makedirs(f"{output_root}/{subj_folder}/")
+    np.save(f"{output_root}/{subj_folder}/time_series.npy", time_series)
+    np.save(f"{output_root}/{subj_folder}/task_data.npy", task_data)
 
 print("****************** DONE ******************")
 ####################################################################################
diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 1e52cb8..0e20700 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -1,3 +1,4 @@
+import argparse
 import json
 import os
 import warnings
@@ -6,130 +7,201 @@
 
 from pydfc import data_loader, task_utils
 
-warnings.simplefilter("ignore")
-
-################################# Parameters #################################
-# data paths
-# main_root = '../../DATA/ds002785' # for local
-main_root = "../../../DATA/task-based/openneuro/ds002785"  # for server
-fmriprep_root = f"{main_root}/derivatives/fmriprep"
-output_root = f"{main_root}/derivatives/ROI_timeseries"
-
-bold_suffix = "_space-MNI152NLin2009cAsym_desc-preproc_bold.nii.gz"
-
-# for consistency we use 0 for resting state
-TASKS = [
-    "task-restingstate",
-    "task-anticipation",
-    "task-emomatching",
-    "task-faces",
-    "task-gstroop",
-    "task-workingmemory",
-]
-
-# find all subjects
-ALL_SUBJs = os.listdir(fmriprep_root)
-ALL_SUBJs = [i for i in ALL_SUBJs if ("sub-" in i) and (not ".html" in i)]
-ALL_SUBJs.sort()
-
-# pick the subject
-job_id = int(os.getenv("SGE_TASK_ID"))
-subj = ALL_SUBJs[job_id - 1]  # SGE_TASK_ID starts from 1 not 0
-
-print(
-    f"subject-level ROI signal extraction CODE started running ... for subject: {subj} ..."
-)
-################################# FIND THE FUNC FILE #################################
-for task in TASKS:
+# warnings.simplefilter("ignore")
+
+
+################################# FUNCTIONS #################################
+def run_roi_signal_extraction(
+    subj, task, main_root, fmriprep_root, bold_suffix, output_root
+):
+    """
+    Extract ROI signals and task labels for a given subject and task
+    """
     # find the func file for this subject and task
     ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/")
     ALL_TASK_FILES = [
-        i for i in ALL_TASK_FILES if (bold_suffix in i) and (task in i)
+        file_i
+        for file_i in ALL_TASK_FILES
+        if (bold_suffix in file_i) and (task in file_i)
     ]  # only keep the denoised files? or use the original files?
-    # print(ALL_TASK_FILES)
-    if not len(ALL_TASK_FILES) == 1:
+
+    if not len(ALL_TASK_FILES) >= 1:
         # if the func file is not found, exclude the subject
         print("Func file not found for " + subj + " " + task)
-        continue
-    fmriprep_file = f"{fmriprep_root}/{subj}/func/{ALL_TASK_FILES[0]}"
-    info_file = (
-        f"{main_root}/{subj}/func/{ALL_TASK_FILES[0].replace(bold_suffix, '_bold.json')}"
-    )
+        return
+
+    # there might be multiple runs for the same task
+    # check if "_run" exists in all the task file names
+    if all(["_run" in task_file for task_file in ALL_TASK_FILES]):
+        multi_run_flag = True
+        # find all the runs
+        RUNS = [
+            task_file[
+                task_file.find("_run")
+                + 1 : task_file.find("_run")
+                + 1
+                + task_file[task_file.find("_run") + 1 :].find("_")
+            ]
+            for task_file in ALL_TASK_FILES
+        ]
+        # sort
+        RUNS.sort()
+        print(f"Found multiple runs for {subj} {task}: {RUNS}")
+    else:
+        multi_run_flag = False
+        RUNS = [""]
+
+    for run in RUNS:
+        task_file = [file_i for file_i in ALL_TASK_FILES if run in file_i][0]
+        nifti_file = f"{fmriprep_root}/{subj}/func/{task_file}"
+        info_file = (
+            f"{main_root}/bids/{subj}/func/{task_file.replace(bold_suffix, '_bold.json')}"
+        )
+
+        ################################# LOAD JSON INFO #########################
+        # Opening JSON file as a dictionary
+        f = open(info_file)
+        acquisition_data = json.load(f)
+        f.close()
+        TR_mri = acquisition_data["RepetitionTime"]
+        ################################# EXTRACT TIME SERIES #########################
+        # extract ROI signals and convert to TIME_SERIES object
+        time_series = data_loader.nifti2timeseries(
+            nifti_file=nifti_file,
+            n_rois=100,
+            Fs=1 / TR_mri,
+            subj_id=subj,
+            confound_strategy="no_motion",
+            standardize="zscore",
+            TS_name="BOLD",
+            session=task,
+        )
+        num_time_mri = time_series.n_time
+        ################################# EXTRACT TASK LABELS #########################
+        oversampling = 50  # more samples per TR than the func data to have a better event_labels time resolution
+        if task == "task-restingstate":
+            events = []
+            event_types = ["rest"]
+            event_labels = np.zeros((int(num_time_mri * oversampling), 1))
+            task_labels = np.zeros((int(num_time_mri * oversampling), 1))
+            Fs_task = float(1 / TR_mri) * oversampling
+        else:
+            task_events_root = f"{main_root}/bids/{subj}/func"
+            ALL_EVENTS_FILES = os.listdir(task_events_root)
+            ALL_EVENTS_FILES = [
+                file_i
+                for file_i in ALL_EVENTS_FILES
+                if (subj in file_i)
+                and (task in file_i)
+                and (run in file_i)
+                and ("events.tsv" in file_i)
+            ]
+            if not len(ALL_EVENTS_FILES) == 1:
+                # if the events file is not found, exclude the subject
+                print(f"Events file not found for {subj} {task} {run}")
+                return
+            # load the tsv events file
+            events_file = f"{task_events_root}/{ALL_EVENTS_FILES[0]}"
+            events = np.genfromtxt(events_file, delimiter="\t", dtype=str)
+            # get the event labels
+            # check that "rest" does not already exist in the event types
+            if any(
+                ["rest" in event_type for event_type in list(np.unique(events[1:, 2]))]
+            ):
+                raise ValueError("Event types should not include 'rest'")
+            event_types = ["rest"] + list(np.unique(events[1:, 2]))
+            event_labels, Fs_task = task_utils.events_time_to_labels(
+                events=events,
+                TR_mri=TR_mri,
+                num_time_mri=num_time_mri,
+                event_types=event_types,
+                oversampling=oversampling,
+                return_0_1=False,
+            )
+            # fill task labels with task's index
+            task_labels = np.ones((int(num_time_mri * oversampling), 1)) * TASKS.index(
+                task
+            )
+        ################################# SAVE #################################
+        # save the ROI time series and task data
+        task_data = {
+            "task": task,
+            "task_labels": task_labels,
+            "task_types": TASKS,
+            "event_labels": event_labels,
+            "event_types": event_types,
+            "events": events,
+            "Fs_task": Fs_task,
+            "TR_mri": TR_mri,
+            "num_time_mri": num_time_mri,
+        }
+        if multi_run_flag:
+            output_file_prefix = f"{subj}_{task}_{run}"
+        else:
+            output_file_prefix = f"{subj}_{task}"
+        if not os.path.exists(f"{output_root}/{subj}/"):
+            os.makedirs(f"{output_root}/{subj}/")
+        np.save(f"{output_root}/{subj}/{output_file_prefix}_time-series.npy", time_series)
+        np.save(f"{output_root}/{subj}/{output_file_prefix}_task-data.npy", task_data)
+
+
+########################################################################################
+
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to convert nifti files to ROI signals for a given participant.
+    """
+
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
+    parser.add_argument("--participant_id", type=str, help="participant id")
 
-    ################################# LOAD JSON INFO #########################
-    # Opening JSON file as a dictionary
-    f = open(info_file)
-    acquisition_data = json.load(f)
-    f.close()
-    TR_mri = acquisition_data["RepetitionTime"]
-    ################################# EXTRACT TIME SERIES #########################
-    # extract ROI signals and convert to TIME_SERIES object
-    time_series = data_loader.nifti2timeseries(
-        nifti_file=fmriprep_file,
-        n_rois=100,
-        Fs=1 / TR_mri,
-        subj_id=subj,
-        confound_strategy="no_motion",
-        standardize="zscore",
-        TS_name="BOLD",
-        session=task,
+    args = parser.parse_args()
+
+    dataset_info_file = args.dataset_info
+    participant_id = args.participant_id
+
+    # Read global configs
+    with open(dataset_info_file, "r") as f:
+        dataset_info = json.load(f)
+
+    print(
+        f"subject-level ROI signal extraction CODE started running ... for subject: {participant_id} ..."
     )
-    num_time_mri = time_series.n_time
-    ################################# EXTRACT TASK LABELS #########################
-    oversampling = 50  # more samples per TR than the func data to have a better event_labels time resolution
-    if task == "task-restingstate":
-        events = []
-        event_types = ["rest"]
-        event_labels = np.zeros((int(num_time_mri * oversampling), 1))
-        task_labels = np.zeros((int(num_time_mri * oversampling), 1))
-        Fs_task = float(1 / TR_mri) * oversampling
+
+    TASKS = dataset_info["TASKS"]
+
+    if "{dataset}" in dataset_info["main_root"]:
+        main_root = dataset_info["main_root"].replace(
+            "{dataset}", dataset_info["dataset"]
+        )
     else:
-        task_events_root = f"{main_root}/{subj}/func/"
-        ALL_EVENTS_FILES = os.listdir(task_events_root)
-        ALL_EVENTS_FILES = [
-            i
-            for i in ALL_EVENTS_FILES
-            if (subj in i) and (task in i) and ("events.tsv" in i)
-        ]
-        if not len(ALL_EVENTS_FILES) == 1:
-            # if the events file is not found, exclude the subject
-            print("Events file not found for " + subj + " " + task)
-            continue
-        # load the tsv events file
-        events_file = task_events_root + ALL_EVENTS_FILES[0]
-        events = np.genfromtxt(events_file, delimiter="\t", dtype=str)
-        # get the task labels
-        event_types = ["rest"] + list(np.unique(events[1:, 2]))
-        event_labels, Fs_task = task_utils.events_time_to_labels(
-            events=events,
-            TR_mri=TR_mri,
-            num_time_mri=num_time_mri,
-            event_types=event_types,
-            oversampling=oversampling,
-            return_0_1=False,
+        main_root = dataset_info["main_root"]
+
+    if "{main_root}" in dataset_info["fmriprep_root"]:
+        fmriprep_root = dataset_info["fmriprep_root"].replace("{main_root}", main_root)
+    else:
+        fmriprep_root = dataset_info["fmriprep_root"]
+
+    if "{main_root}" in dataset_info["output_root"]:
+        output_root = dataset_info["output_root"].replace("{main_root}", main_root)
+    else:
+        output_root = dataset_info["output_root"]
+
+    for task in TASKS:
+        run_roi_signal_extraction(
+            subj=participant_id,
+            task=task,
+            main_root=main_root,
+            fmriprep_root=fmriprep_root,
+            bold_suffix=dataset_info["bold_suffix"],
+            output_root=output_root,
         )
-        # fill task labels with 0 (rest) and k (task's index)
-        task_labels = np.multiply(event_labels != 0, TASKS.index(task))
-    ################################# SAVE #################################
-    # save the ROI time series and task data
-    task_data = {
-        "task": task,
-        "task_labels": task_labels,
-        "task_types": TASKS,
-        "event_labels": event_labels,
-        "event_types": event_types,
-        "events": events,
-        "Fs_task": Fs_task,
-        "TR_mri": TR_mri,
-        "num_time_mri": num_time_mri,
-    }
-    subj_folder = f"{subj}_{task}"
-    if not os.path.exists(f"{output_root}/{subj_folder}/"):
-        os.makedirs(f"{output_root}/{subj_folder}/")
-    np.save(f"{output_root}/{subj_folder}/time_series.npy", time_series)
-    np.save(f"{output_root}/{subj_folder}/task_data.npy", task_data)
-
-print(
-    f"subject-level ROI signal extraction CODE finished running ... for subject: {subj} ..."
-)
+
+    print(
+        f"subject-level ROI signal extraction CODE finished running ... for subject: {participant_id} ..."
+    )
+
 ####################################################################

From cd48fdfbd787e6f1b577588edb0e646c9e5b4c32 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 26 Apr 2024 15:41:47 -0400
Subject: [PATCH 010/401] rearrange nifti_to_roi

---
 task_dFC/nifti_to_roi_signal.py | 304 ++++++++++++++++++++------------
 1 file changed, 188 insertions(+), 116 deletions(-)

diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 1e52cb8..0e20700 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -1,3 +1,4 @@
+import argparse
 import json
 import os
 import warnings
@@ -6,130 +7,201 @@
 
 from pydfc import data_loader, task_utils
 
-warnings.simplefilter("ignore")
-
-################################# Parameters #################################
-# data paths
-# main_root = '../../DATA/ds002785' # for local
-main_root = "../../../DATA/task-based/openneuro/ds002785"  # for server
-fmriprep_root = f"{main_root}/derivatives/fmriprep"
-output_root = f"{main_root}/derivatives/ROI_timeseries"
-
-bold_suffix = "_space-MNI152NLin2009cAsym_desc-preproc_bold.nii.gz"
-
-# for consistency we use 0 for resting state
-TASKS = [
-    "task-restingstate",
-    "task-anticipation",
-    "task-emomatching",
-    "task-faces",
-    "task-gstroop",
-    "task-workingmemory",
-]
-
-# find all subjects
-ALL_SUBJs = os.listdir(fmriprep_root)
-ALL_SUBJs = [i for i in ALL_SUBJs if ("sub-" in i) and (not ".html" in i)]
-ALL_SUBJs.sort()
-
-# pick the subject
-job_id = int(os.getenv("SGE_TASK_ID"))
-subj = ALL_SUBJs[job_id - 1]  # SGE_TASK_ID starts from 1 not 0
-
-print(
-    f"subject-level ROI signal extraction CODE started running ... for subject: {subj} ..."
-)
-################################# FIND THE FUNC FILE #################################
-for task in TASKS:
+# warnings.simplefilter("ignore")
+
+
+################################# FUNCTIONS #################################
+def run_roi_signal_extraction(
+    subj, task, main_root, fmriprep_root, bold_suffix, output_root
+):
+    """
+    Extract ROI signals and task labels for a given subject and task
+    """
     # find the func file for this subject and task
     ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/")
     ALL_TASK_FILES = [
-        i for i in ALL_TASK_FILES if (bold_suffix in i) and (task in i)
+        file_i
+        for file_i in ALL_TASK_FILES
+        if (bold_suffix in file_i) and (task in file_i)
     ]  # only keep the denoised files? or use the original files?
-    # print(ALL_TASK_FILES)
-    if not len(ALL_TASK_FILES) == 1:
+
+    if not len(ALL_TASK_FILES) >= 1:
         # if the func file is not found, exclude the subject
         print("Func file not found for " + subj + " " + task)
-        continue
-    fmriprep_file = f"{fmriprep_root}/{subj}/func/{ALL_TASK_FILES[0]}"
-    info_file = (
-        f"{main_root}/{subj}/func/{ALL_TASK_FILES[0].replace(bold_suffix, '_bold.json')}"
-    )
+        return
+
+    # there might be multiple runs for the same task
+    # check if "_run" exists in all the task file names
+    if all(["_run" in task_file for task_file in ALL_TASK_FILES]):
+        multi_run_flag = True
+        # find all the runs
+        RUNS = [
+            task_file[
+                task_file.find("_run")
+                + 1 : task_file.find("_run")
+                + 1
+                + task_file[task_file.find("_run") + 1 :].find("_")
+            ]
+            for task_file in ALL_TASK_FILES
+        ]
+        # sort
+        RUNS.sort()
+        print(f"Found multiple runs for {subj} {task}: {RUNS}")
+    else:
+        multi_run_flag = False
+        RUNS = [""]
+
+    for run in RUNS:
+        task_file = [file_i for file_i in ALL_TASK_FILES if run in file_i][0]
+        nifti_file = f"{fmriprep_root}/{subj}/func/{task_file}"
+        info_file = (
+            f"{main_root}/bids/{subj}/func/{task_file.replace(bold_suffix, '_bold.json')}"
+        )
+
+        ################################# LOAD JSON INFO #########################
+        # Opening JSON file as a dictionary
+        f = open(info_file)
+        acquisition_data = json.load(f)
+        f.close()
+        TR_mri = acquisition_data["RepetitionTime"]
+        ################################# EXTRACT TIME SERIES #########################
+        # extract ROI signals and convert to TIME_SERIES object
+        time_series = data_loader.nifti2timeseries(
+            nifti_file=nifti_file,
+            n_rois=100,
+            Fs=1 / TR_mri,
+            subj_id=subj,
+            confound_strategy="no_motion",
+            standardize="zscore",
+            TS_name="BOLD",
+            session=task,
+        )
+        num_time_mri = time_series.n_time
+        ################################# EXTRACT TASK LABELS #########################
+        oversampling = 50  # more samples per TR than the func data to have a better event_labels time resolution
+        if task == "task-restingstate":
+            events = []
+            event_types = ["rest"]
+            event_labels = np.zeros((int(num_time_mri * oversampling), 1))
+            task_labels = np.zeros((int(num_time_mri * oversampling), 1))
+            Fs_task = float(1 / TR_mri) * oversampling
+        else:
+            task_events_root = f"{main_root}/bids/{subj}/func"
+            ALL_EVENTS_FILES = os.listdir(task_events_root)
+            ALL_EVENTS_FILES = [
+                file_i
+                for file_i in ALL_EVENTS_FILES
+                if (subj in file_i)
+                and (task in file_i)
+                and (run in file_i)
+                and ("events.tsv" in file_i)
+            ]
+            if not len(ALL_EVENTS_FILES) == 1:
+                # if the events file is not found, exclude the subject
+                print(f"Events file not found for {subj} {task} {run}")
+                return
+            # load the tsv events file
+            events_file = f"{task_events_root}/{ALL_EVENTS_FILES[0]}"
+            events = np.genfromtxt(events_file, delimiter="\t", dtype=str)
+            # get the event labels
+            # check that "rest" does not already exist in the event types
+            if any(
+                ["rest" in event_type for event_type in list(np.unique(events[1:, 2]))]
+            ):
+                raise ValueError("Event types should not include 'rest'")
+            event_types = ["rest"] + list(np.unique(events[1:, 2]))
+            event_labels, Fs_task = task_utils.events_time_to_labels(
+                events=events,
+                TR_mri=TR_mri,
+                num_time_mri=num_time_mri,
+                event_types=event_types,
+                oversampling=oversampling,
+                return_0_1=False,
+            )
+            # fill task labels with task's index
+            task_labels = np.ones((int(num_time_mri * oversampling), 1)) * TASKS.index(
+                task
+            )
+        ################################# SAVE #################################
+        # save the ROI time series and task data
+        task_data = {
+            "task": task,
+            "task_labels": task_labels,
+            "task_types": TASKS,
+            "event_labels": event_labels,
+            "event_types": event_types,
+            "events": events,
+            "Fs_task": Fs_task,
+            "TR_mri": TR_mri,
+            "num_time_mri": num_time_mri,
+        }
+        if multi_run_flag:
+            output_file_prefix = f"{subj}_{task}_{run}"
+        else:
+            output_file_prefix = f"{subj}_{task}"
+        if not os.path.exists(f"{output_root}/{subj}/"):
+            os.makedirs(f"{output_root}/{subj}/")
+        np.save(f"{output_root}/{subj}/{output_file_prefix}_time-series.npy", time_series)
+        np.save(f"{output_root}/{subj}/{output_file_prefix}_task-data.npy", task_data)
+
+
+########################################################################################
+
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to convert nifti files to ROI signals for a given participant.
+    """
+
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
+    parser.add_argument("--participant_id", type=str, help="participant id")
 
-    ################################# LOAD JSON INFO #########################
-    # Opening JSON file as a dictionary
-    f = open(info_file)
-    acquisition_data = json.load(f)
-    f.close()
-    TR_mri = acquisition_data["RepetitionTime"]
-    ################################# EXTRACT TIME SERIES #########################
-    # extract ROI signals and convert to TIME_SERIES object
-    time_series = data_loader.nifti2timeseries(
-        nifti_file=fmriprep_file,
-        n_rois=100,
-        Fs=1 / TR_mri,
-        subj_id=subj,
-        confound_strategy="no_motion",
-        standardize="zscore",
-        TS_name="BOLD",
-        session=task,
+    args = parser.parse_args()
+
+    dataset_info_file = args.dataset_info
+    participant_id = args.participant_id
+
+    # Read global configs
+    with open(dataset_info_file, "r") as f:
+        dataset_info = json.load(f)
+
+    print(
+        f"subject-level ROI signal extraction CODE started running ... for subject: {participant_id} ..."
     )
-    num_time_mri = time_series.n_time
-    ################################# EXTRACT TASK LABELS #########################
-    oversampling = 50  # more samples per TR than the func data to have a better event_labels time resolution
-    if task == "task-restingstate":
-        events = []
-        event_types = ["rest"]
-        event_labels = np.zeros((int(num_time_mri * oversampling), 1))
-        task_labels = np.zeros((int(num_time_mri * oversampling), 1))
-        Fs_task = float(1 / TR_mri) * oversampling
+
+    TASKS = dataset_info["TASKS"]
+
+    if "{dataset}" in dataset_info["main_root"]:
+        main_root = dataset_info["main_root"].replace(
+            "{dataset}", dataset_info["dataset"]
+        )
     else:
-        task_events_root = f"{main_root}/{subj}/func/"
-        ALL_EVENTS_FILES = os.listdir(task_events_root)
-        ALL_EVENTS_FILES = [
-            i
-            for i in ALL_EVENTS_FILES
-            if (subj in i) and (task in i) and ("events.tsv" in i)
-        ]
-        if not len(ALL_EVENTS_FILES) == 1:
-            # if the events file is not found, exclude the subject
-            print("Events file not found for " + subj + " " + task)
-            continue
-        # load the tsv events file
-        events_file = task_events_root + ALL_EVENTS_FILES[0]
-        events = np.genfromtxt(events_file, delimiter="\t", dtype=str)
-        # get the task labels
-        event_types = ["rest"] + list(np.unique(events[1:, 2]))
-        event_labels, Fs_task = task_utils.events_time_to_labels(
-            events=events,
-            TR_mri=TR_mri,
-            num_time_mri=num_time_mri,
-            event_types=event_types,
-            oversampling=oversampling,
-            return_0_1=False,
+        main_root = dataset_info["main_root"]
+
+    if "{main_root}" in dataset_info["fmriprep_root"]:
+        fmriprep_root = dataset_info["fmriprep_root"].replace("{main_root}", main_root)
+    else:
+        fmriprep_root = dataset_info["fmriprep_root"]
+
+    if "{main_root}" in dataset_info["output_root"]:
+        output_root = dataset_info["output_root"].replace("{main_root}", main_root)
+    else:
+        output_root = dataset_info["output_root"]
+
+    for task in TASKS:
+        run_roi_signal_extraction(
+            subj=participant_id,
+            task=task,
+            main_root=main_root,
+            fmriprep_root=fmriprep_root,
+            bold_suffix=dataset_info["bold_suffix"],
+            output_root=output_root,
         )
-        # fill task labels with 0 (rest) and k (task's index)
-        task_labels = np.multiply(event_labels != 0, TASKS.index(task))
-    ################################# SAVE #################################
-    # save the ROI time series and task data
-    task_data = {
-        "task": task,
-        "task_labels": task_labels,
-        "task_types": TASKS,
-        "event_labels": event_labels,
-        "event_types": event_types,
-        "events": events,
-        "Fs_task": Fs_task,
-        "TR_mri": TR_mri,
-        "num_time_mri": num_time_mri,
-    }
-    subj_folder = f"{subj}_{task}"
-    if not os.path.exists(f"{output_root}/{subj_folder}/"):
-        os.makedirs(f"{output_root}/{subj_folder}/")
-    np.save(f"{output_root}/{subj_folder}/time_series.npy", time_series)
-    np.save(f"{output_root}/{subj_folder}/task_data.npy", task_data)
-
-print(
-    f"subject-level ROI signal extraction CODE finished running ... for subject: {subj} ..."
-)
+
+    print(
+        f"subject-level ROI signal extraction CODE finished running ... for subject: {participant_id} ..."
+    )
+
 ####################################################################

From 8c6a5166099b2918df0e89efda142c67180c8e6b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 26 Apr 2024 15:45:03 -0400
Subject: [PATCH 011/401] update output save

---
 simul_dFC/task_data_simulator.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index f6bc3a9..d4d5cd2 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -143,11 +143,11 @@
     time_series, task_data = simul_utils.simulate_task(subj_id, all_task_info[task])
 
     # save the time series and task data
-    subj_folder = f"{subj_id}_{task}"
-    if not os.path.exists(f"{output_root}/{subj_folder}/"):
-        os.makedirs(f"{output_root}/{subj_folder}/")
-    np.save(f"{output_root}/{subj_folder}/time_series.npy", time_series)
-    np.save(f"{output_root}/{subj_folder}/task_data.npy", task_data)
+    output_file_prefix = f"{subj_id}_{task}"
+    if not os.path.exists(f"{output_root}/{subj_id}/"):
+        os.makedirs(f"{output_root}/{subj_id}/")
+    np.save(f"{output_root}/{subj_id}/{output_file_prefix}_time-series.npy", time_series)
+    np.save(f"{output_root}/{subj_id}/{output_file_prefix}_task-data.npy", task_data)
 
 print("****************** DONE ******************")
 ####################################################################################

From b1ec4b45fda8bc4545d26531dcac703f5fb8ecce Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 26 Apr 2024 15:46:28 -0400
Subject: [PATCH 012/401] fix simul_utils

---
 pydfc/simul_utils.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py
index 109aac2..51bf728 100644
--- a/pydfc/simul_utils.py
+++ b/pydfc/simul_utils.py
@@ -6,11 +6,7 @@
 @author: Mohammad Torabi
 """
 
-import re
-from calendar import c
-
 import numpy as np
-from matplotlib.pylab import rand
 from tvb.simulator.lab import *
 
 from pydfc import TIME_SERIES, task_utils

From d0087a0e99cdc6560ff089133bf83a1ff54d7515 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 26 Apr 2024 20:36:24 -0400
Subject: [PATCH 013/401] fix bug

---
 pydfc/simul_utils.py             | 6 +++---
 simul_dFC/task_data_simulator.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py
index 51bf728..d716498 100644
--- a/pydfc/simul_utils.py
+++ b/pydfc/simul_utils.py
@@ -182,7 +182,7 @@ def create_simul_task_info(
         generate more samples per TR than the func data to have a
         better event_labels time resolution
     """
-    ################################# EXTRACT TASK LABELS #########################
+    ####################### EXTRACT TASK LABELS #######################
     events = []
     event_types = ["rest", "task"]
 
@@ -276,7 +276,7 @@ def simulate_task_BOLD_TS(
     return time_series, task_data
 
 
-def simulate_task(subj_id, task_info):
+def simulate_task_data(subj_id, task_info):
     """
     Simulate task-based BOLD signal for a subject.
 
@@ -309,7 +309,7 @@ def simulate_task(subj_id, task_info):
             - dt: float
                 The simulation time step.
     """
-    time_series, task_data = simulate_task(
+    time_series, task_data = simulate_task_BOLD_TS(
         subj_id=subj_id,
         task=task_info["task_name"],
         onset_time=task_info["onset_time"],
diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index d4d5cd2..7823932 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -140,7 +140,7 @@
 
 for task in all_task_info:
 
-    time_series, task_data = simul_utils.simulate_task(subj_id, all_task_info[task])
+    time_series, task_data = simul_utils.simulate_task_data(subj_id, all_task_info[task])
 
     # save the time series and task data
     output_file_prefix = f"{subj_id}_{task}"

From ee39b25e5b9af3d669b199064625a4c8fa2f508a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 29 Apr 2024 13:43:49 -0400
Subject: [PATCH 014/401] update FCS_estimate

---
 simul_dFC/FCS_estimate.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/simul_dFC/FCS_estimate.py b/simul_dFC/FCS_estimate.py
index caf2aa0..5ccd06e 100644
--- a/simul_dFC/FCS_estimate.py
+++ b/simul_dFC/FCS_estimate.py
@@ -14,21 +14,27 @@
 
 ################################# Parameters #################################
 # data paths
-dataset = "ds000001"
+dataset = "ds000002"
 # main_root = f"./DATA/{dataset}" # for local
-main_root = f"../../DATA/task-based/simulated/{dataset}"  # for server
+main_root = f"/data/origami/dFC/DATA/task-based/simulated/{dataset}"  # for server
 roi_root = f"{main_root}/derivatives/ROI_timeseries"
 output_root = f"{main_root}/derivatives/fitted_MEASURES"
 
-# for consistency we use 0 for resting state
-TASKS = ["task-pulse"]
+TASKS = [
+    "task-midFreqMidRest",
+    "task-lowFreqLongRest",
+    "task-lowFreqShortRest",
+    "task-lowFreqShortTask",
+    "task-highFreqLongRest",
+    "task-highFreqShortRest",
+    "task-midFreqMidRestNoisy",
+]
 
-# job_id = int(os.getenv("SGE_TASK_ID"))
-# TASK_id = job_id-1 # SGE_TASK_ID starts from 1 not 0
-# if TASK_id >= len(TASKS):
-#     print("TASK_id out of TASKS")
-#     exit()
-TASK_id = 0
+job_id = int(os.getenv("SGE_TASK_ID"))
+TASK_id = job_id - 1  # SGE_TASK_ID starts from 1 not 0
+if TASK_id >= len(TASKS):
+    print("TASK_id out of TASKS")
+    exit()
 task = TASKS[TASK_id]
 
 ###### MEASUREMENT PARAMETERS ######

From 435186469f9fbbeb8e06a0f0aee22c5edd1b7309 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 29 Apr 2024 21:52:34 -0400
Subject: [PATCH 015/401] update FCS_estimate

---
 simul_dFC/FCS_estimate.py       |  26 +++----
 task_dFC/FCS_estimate.py        | 126 +++++++++++++++++---------------
 task_dFC/nifti_to_roi_signal.py |   9 ++-
 3 files changed, 86 insertions(+), 75 deletions(-)

diff --git a/simul_dFC/FCS_estimate.py b/simul_dFC/FCS_estimate.py
index 5ccd06e..0fd7653 100644
--- a/simul_dFC/FCS_estimate.py
+++ b/simul_dFC/FCS_estimate.py
@@ -65,8 +65,8 @@
     "session": task,
     # Hyper Parameters
     "normalization": True,
-    "num_subj": None,  # None or 200?
-    "num_time_point": None,  # None or set?
+    "num_subj": None,
+    "num_time_point": None,
 }
 
 ###### HYPER PARAMETERS ALTERNATIVE ######
@@ -106,15 +106,12 @@
 ################################# LOAD DATA #################################
 
 BOLD = data_loader.load_TS(
-    data_root=roi_root, file_name="time_series.npy", SESSIONs=task, subj_id2load=None
+    data_root=roi_root,
+    file_name="{subj_id}_{task}_time-series.npy",
+    SESSIONs=task,
+    subj_id2load=None,
+    task=task,
 )
-
-################################# Visualize BOLD #################################
-
-# for session in BOLD:
-#     BOLD.visualize(start_time=0, end_time=2000, nodes_lst=list(range(10)),
-#         save_image=False, output_root=None)
-
 ################################ Measures of dFC #################################
 
 MA = MultiAnalysis(
@@ -136,15 +133,14 @@
     if measure.is_state_based:
         measure.estimate_FCS(time_series=BOLD)
 
-    # dFC_analyzer.estimate_group_FCS(time_series_dict=BOLD)
     print("FCS estimation done.")
 
     # Save
-    if not os.path.exists(f"{output_root}/{task}"):
-        os.makedirs(f"{output_root}/{task}")
-    np.save(f"{output_root}/{task}/MEASURE_{str(MEASURE_id)}.npy", measure)
+    if not os.path.exists(f"{output_root}"):
+        os.makedirs(f"{output_root}")
+    np.save(f"{output_root}/MEASURE_{task}_{MEASURE_id}.npy", measure)
 
 print(f"Measurement required {time.time() - tic:0.3f} seconds.")
-np.save(f"{output_root}/{task}/multi_analysis.npy", MA)
+np.save(f"{output_root}/multi-analysis_{task}.npy", MA)
 
 #################################################################################
diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index de4d738..d171085 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -14,20 +14,21 @@
 
 ################################# Parameters #################################
 # data paths
-# main_root = '../../DATA/ds002785/' # for local
-main_root = "../../../DATA/task-based/openneuro/ds002785"  # for server
+dataset = "ds003242"
+# main_root = f"../../DATA/{dataset}" # for local
+main_root = f"/data/origami/dFC/DATA/task-based/openneuro/{dataset}"  # for server
 roi_root = f"{main_root}/derivatives/ROI_timeseries"
 output_root = f"{main_root}/derivatives/fitted_MEASURES"
 
 # for consistency we use 0 for resting state
-TASKS = [
-    "task-restingstate",
-    "task-anticipation",
-    "task-emomatching",
-    "task-faces",
-    "task-gstroop",
-    "task-workingmemory",
-]
+TASKS = ["task-CIC", "task-midloc"]
+
+# default RUNS = None
+RUNS = None
+RUNS = {
+    "task-CIC": ["run-001", "run-002", "run-003", "run-004", "run-005", "run-006"],
+    "task-midloc": ["run-001"],
+}
 
 job_id = int(os.getenv("SGE_TASK_ID"))
 TASK_id = job_id - 1  # SGE_TASK_ID starts from 1 not 0
@@ -42,7 +43,7 @@
 
 params_methods = {
     # Sliding Parameters
-    "W": 44,
+    "W": 12,
     "n_overlap": 1.0,
     "sw_method": "pear_corr",
     "tapered_window": True,
@@ -54,8 +55,8 @@
     "hmm_iter": 20,
     "dhmm_obs_state_ratio": 16 / 24,
     # State Parameters
-    "n_states": 12,
-    "n_subj_clstrs": 20,
+    "n_states": 5,
+    "n_subj_clstrs": 10,
     # Parallelization Parameters
     "n_jobs": 2,
     "verbose": 0,
@@ -64,8 +65,8 @@
     "session": task,
     # Hyper Parameters
     "normalization": True,
-    "num_subj": None,  # None or 216?
-    "num_time_point": None,  # None or set?
+    "num_subj": None,
+    "num_time_point": None,
 }
 
 ###### HYPER PARAMETERS ALTERNATIVE ######
@@ -102,48 +103,57 @@
     "backend": "loky",
 }
 
-################################# LOAD DATA #################################
-
-BOLD = data_loader.load_TS(
-    data_root=roi_root, file_name="time_series.npy", SESSIONs=task, subj_id2load=None
-)
-
-################################# Visualize BOLD #################################
-
-# for session in BOLD:
-#     BOLD.visualize(start_time=0, end_time=2000, nodes_lst=list(range(10)),
-#         save_image=False, output_root=None)
-
-################################ Measures of dFC #################################
-
-MA = MultiAnalysis(
-    analysis_name=f"task-based-dFC-ds002785-{task}", **params_multi_analysis
-)
-
-MEASURES_lst = MA.measures_initializer(MEASURES_name_lst, params_methods, alter_hparams)
-
-tic = time.time()
-print("Measurement Started ...")
-
-################################# estimate FCS #################################
-
-for MEASURE_id, measure in enumerate(MEASURES_lst):
-
-    print("MEASURE: " + measure.measure_name)
-    print("FCS estimation started...")
-
-    if measure.is_state_based:
-        measure.estimate_FCS(time_series=BOLD)
-
-    # dFC_analyzer.estimate_group_FCS(time_series_dict=BOLD)
-    print("FCS estimation done.")
-
-    # Save
-    if not os.path.exists(f"{output_root}/{task}"):
-        os.makedirs(f"{output_root}/{task}")
-    np.save(f"{output_root}/{task}/MEASURE_{str(MEASURE_id)}.npy", measure)
-
-print(f"Measurement required {time.time() - tic:0.3f} seconds.")
-np.save(f"{output_root}/{task}/multi_analysis.npy", MA)
+if RUNS is None:
+    RUNS = {task: [None]}
+for run in RUNS[task]:
+    if run is None:
+        print(f"TASK: {task} started ...")
+        file_suffix = f"{task}"
+        BOLD_file_name = "{subj_id}_{task}_time-series.npy"
+    else:
+        print(f"TASK: {task}, RUN: {run} started ...")
+        file_suffix = f"{task}_{run}"
+        BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy"
+    ################################# LOAD DATA #################################
+    BOLD = data_loader.load_TS(
+        data_root=roi_root,
+        file_name=BOLD_file_name,
+        SESSIONs=task,
+        subj_id2load=None,
+        task=task,
+        run=run,
+    )
+    ################################ Measures of dFC #################################
+
+    MA = MultiAnalysis(
+        analysis_name=f"task-based-dFC-{dataset}-{file_suffix}", **params_multi_analysis
+    )
+
+    MEASURES_lst = MA.measures_initializer(
+        MEASURES_name_lst, params_methods, alter_hparams
+    )
+
+    tic = time.time()
+    print("Measurement Started ...")
+
+    ################################# estimate FCS #################################
+
+    for MEASURE_id, measure in enumerate(MEASURES_lst):
+
+        print("MEASURE: " + measure.measure_name)
+        print("FCS estimation started...")
+
+        if measure.is_state_based:
+            measure.estimate_FCS(time_series=BOLD)
+
+        print("FCS estimation done.")
+
+        # Save
+        if not os.path.exists(f"{output_root}"):
+            os.makedirs(f"{output_root}")
+        np.save(f"{output_root}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure)
+
+    print(f"Measurement required {time.time() - tic:0.3f} seconds.")
+    np.save(f"{output_root}/multi-analysis_{file_suffix}.npy", MA)
 
 #################################################################################
diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 0e20700..59c8792 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -18,7 +18,12 @@ def run_roi_signal_extraction(
     Extract ROI signals and task labels for a given subject and task
     """
     # find the func file for this subject and task
-    ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/")
+    try:
+        ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/")
+    except FileNotFoundError:
+        print(f"Subject {subj} not found in {fmriprep_root}")
+        return
+
     ALL_TASK_FILES = [
         file_i
         for file_i in ALL_TASK_FILES
@@ -27,7 +32,7 @@ def run_roi_signal_extraction(
 
     if not len(ALL_TASK_FILES) >= 1:
         # if the func file is not found, exclude the subject
-        print("Func file not found for " + subj + " " + task)
+        print(f"Func file not found for {subj} {task}")
         return
 
     # there might be multiple runs for the same task

From 0c6158ff032465e3db54a8b6d49c14e62a8ea8af Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 1 May 2024 17:53:19 -0400
Subject: [PATCH 016/401] reorganize dFC_assess

---
 task_dFC/dFC_assessment.py      | 257 ++++++++++++++++++++++----------
 task_dFC/nifti_to_roi_signal.py |   6 +-
 2 files changed, 179 insertions(+), 84 deletions(-)

diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py
index a381f95..84564c4 100644
--- a/task_dFC/dFC_assessment.py
+++ b/task_dFC/dFC_assessment.py
@@ -1,3 +1,5 @@
+import argparse
+import json
 import os
 import time
 import warnings
@@ -12,97 +14,190 @@
 os.environ["NUMEXPR_NUM_THREADS"] = "16"
 os.environ["OMP_NUM_THREADS"] = "16"
 
-################################# Parameters #################################
-
-# Data parameters
-# main_root = '../../DATA/ds002785/' # for local
-main_root = "../../../DATA/task-based/openneuro/ds002785/"  # for server
-
-# subjects used for dFC assessment do not need to be the same as those used for FCS_estimate
-# you can set the new roi root and data load parameters here:
-roi_root = f"{main_root}/derivatives/ROI_timeseries"
-fitted_measures_root = f"{main_root}/derivatives/fitted_MEASURES"
-output_root = f"{main_root}/derivatives/dFC_assessed"
-
-# for consistency we use 0 for resting state
-TASKS = [
-    "task-restingstate",
-    "task-anticipation",
-    "task-emomatching",
-    "task-faces",
-    "task-gstroop",
-    "task-workingmemory",
-]
-
-# find all subjects across all tasks
-SUBJECTS = data_loader.find_subj_list(data_root=roi_root, sessions=TASKS)
-
-# job_id selects the subject
-job_id = int(os.getenv("SGE_TASK_ID"))
-if job_id > len(SUBJECTS):
-    print("job_id > len(SUBJECTS)")
-    exit()
-subj_id = SUBJECTS[job_id - 1]  # SGE_TASK_ID starts from 1 not 0
-
-for task in TASKS:
-
-    MA = np.load(
-        f"{fitted_measures_root}/{task}/multi_analysis.npy", allow_pickle="TRUE"
-    ).item()
-
-    # check if the subject has this task
-    SUBJECTS_with_this_task = data_loader.find_subj_list(
-        data_root=roi_root, sessions=[task]
-    )
-    if not subj_id in SUBJECTS_with_this_task:
-        print(f"subject {subj_id} not in the list of subjects with task {task}")
-        continue
-
-    ################################# LOAD FIT MEASURES #################################
-
-    ALL_RECORDS = os.listdir(f"{fitted_measures_root}/{task}/")
-    ALL_RECORDS = [i for i in ALL_RECORDS if "MEASURE" in i]
-    ALL_RECORDS.sort()
-    MEASURES_fit_lst = list()
-    for s in ALL_RECORDS:
-        fit_measure = np.load(
-            f"{fitted_measures_root}/{task}/{s}", allow_pickle="TRUE"
+################################# Functions #################################
+
+
+def run_dFC_assess(
+    subj_id,
+    task,
+    roi_root,
+    fitted_measures_root,
+    output_root,
+):
+
+    # check if the subject has this task in roi_root
+    if not os.path.exists(f"{roi_root}/{subj_id}"):
+        print(f"Subject {subj_id} not found in {roi_root}")
+        return
+
+    ALL_ROI_FILES = os.listdir(f"{roi_root}/{subj_id}/")
+    ALL_ROI_FILES = [
+        roi_file
+        for roi_file in ALL_ROI_FILES
+        if ("_time-series.npy" in roi_file) and (task in roi_file)
+    ]
+    ALL_ROI_FILES.sort()
+
+    # check if "_run" exists in all the task file names
+    if all(["_run" in roi_file for roi_file in ALL_ROI_FILES]):
+        # find all the runs
+        RUNS = [
+            roi_file[
+                roi_file.find("_run")
+                + 1 : roi_file.find("_run")
+                + 1
+                + roi_file[roi_file.find("_run") + 1 :].find("_")
+            ]
+            for roi_file in ALL_ROI_FILES
+        ]
+        # sort
+        RUNS.sort()
+        print(f"Found multiple runs for {subj_id} {task}: {RUNS}")
+    else:
+        RUNS = [None]
+
+    for run in RUNS:
+
+        # check if the subject has this task and run in roi_root
+        if run is None:
+            file_suffix = f"{task}"
+            if not os.path.exists(
+                f"{roi_root}/{subj_id}/{subj_id}_{file_suffix}_time-series.npy"
+            ):
+                print(f"Time series file not found for {subj_id} {task}")
+                continue
+            else:
+                print(
+                    f"subject-level dFC assessment CODE started running ... for task {task} of subject {subj_id} ..."
+                )
+                BOLD_file_name = "{subj_id}_{task}_time-series.npy"
+        else:
+            file_suffix = f"{task}_{run}"
+            if not os.path.exists(
+                f"{roi_root}/{subj_id}/{subj_id}_{file_suffix}_time-series.npy"
+            ):
+                print(f"Time series file not found for {subj_id} {task} {run}")
+                continue
+            else:
+                print(
+                    f"subject-level dFC assessment CODE started running ... for task {task} and {run} of subject {subj_id} ..."
+                )
+                BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy"
+
+        ################################# LOAD FIT MEASURES #################################
+
+        MA = np.load(
+            f"{fitted_measures_root}/multi-analysis_{file_suffix}.npy",
+            allow_pickle="TRUE",
         ).item()
-        MEASURES_fit_lst.append(fit_measure)
-    MA.set_MEASURES_fit_lst(MEASURES_fit_lst)
-    print("fitted MEASURES loaded ...")
 
-    ################################# LOAD DATA #################################
+        ALL_RECORDS = os.listdir(f"{fitted_measures_root}/")
+        ALL_RECORDS = [i for i in ALL_RECORDS if ("MEASURE" in i) and (file_suffix in i)]
+        ALL_RECORDS.sort()
+        MEASURES_fit_lst = list()
+        for s in ALL_RECORDS:
+            fit_measure = np.load(
+                f"{fitted_measures_root}/{s}", allow_pickle="TRUE"
+            ).item()
+            MEASURES_fit_lst.append(fit_measure)
+        MA.set_MEASURES_fit_lst(MEASURES_fit_lst)
+        print("fitted MEASURES are loaded ...")
 
-    print(
-        f"subject-level dFC assessment CODE started running ... for task {task} of subject {subj_id} ..."
-    )
+        ################################# LOAD DATA #################################
 
-    BOLD = data_loader.load_TS(
-        data_root=roi_root,
-        file_name="time_series.npy",
-        SESSIONs=[task],
-        subj_id2load=subj_id,
-    )
+        BOLD = data_loader.load_TS(
+            data_root=roi_root,
+            file_name=BOLD_file_name,
+            SESSIONs=task,
+            subj_id2load=subj_id,
+            task=task,
+            run=run,
+        )
+
+        ################################# dFC ASSESSMENT #################################
+
+        tic = time.time()
+        print("Measurement Started ...")
+
+        print("dFC estimation started...")
+        dFC_dict = MA.subj_lvl_dFC_assess(time_series_dict=BOLD)
+        print("dFC estimation done.")
+
+        print(f"Measurement required {time.time() - tic:0.3f} seconds.")
+
+        ################################# SAVE DATA #################################
+
+        folder = f"{output_root}/{subj_id}"
+        if not os.path.exists(folder):
+            os.makedirs(folder)
 
-    ################################# dFC ASSESSMENT #################################
+        for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]):
+            np.save(f"{folder}/dFC_{file_suffix}_{dFC_id}.npy", dFC)
 
-    tic = time.time()
-    print("Measurement Started ...")
 
-    print("dFC estimation started...")
-    dFC_dict = MA.subj_lvl_dFC_assess(time_series_dict=BOLD)
-    print("dFC estimation done.")
+#######################################################################################
+
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to assess dFC for a given participant.
+    """
+
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
+    parser.add_argument("--participant_id", type=str, help="participant id")
+
+    args = parser.parse_args()
 
-    print(f"Measurement required {time.time() - tic:0.3f} seconds.")
+    dataset_info_file = args.dataset_info
+    participant_id = args.participant_id
 
-    ################################# SAVE DATA #################################
+    # Read global configs
+    with open(dataset_info_file, "r") as f:
+        dataset_info = json.load(f)
 
-    folder = f"{output_root}/{task}/{subj_id}"
-    if not os.path.exists(folder):
-        os.makedirs(folder)
+    print(
+        f"subject-level dFC assessment CODE started running ... for subject: {participant_id} ..."
+    )
+
+    TASKS = dataset_info["TASKS"]
+
+    if "{dataset}" in dataset_info["main_root"]:
+        main_root = dataset_info["main_root"].replace(
+            "{dataset}", dataset_info["dataset"]
+        )
+    else:
+        main_root = dataset_info["main_root"]
+
+    if "{main_root}" in dataset_info["roi_root"]:
+        roi_root = dataset_info["roi_root"].replace("{main_root}", main_root)
+    else:
+        roi_root = dataset_info["roi_root"]
+
+    if "{main_root}" in dataset_info["fitted_measures_root"]:
+        fitted_measures_root = dataset_info["fitted_measures_root"].replace(
+            "{main_root}", main_root
+        )
+    else:
+        fitted_measures_root = dataset_info["fitted_measures_root"]
+
+    if "{main_root}" in dataset_info["dFC_root"]:
+        output_root = dataset_info["dFC_root"].replace("{main_root}", main_root)
+    else:
+        output_root = dataset_info["dFC_root"]
+
+    for task in TASKS:
+        run_dFC_assess(
+            subj_id=participant_id,
+            task=task,
+            roi_root=roi_root,
+            fitted_measures_root=fitted_measures_root,
+            output_root=output_root,
+        )
 
-    for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]):
-        np.save(f"{folder}/dFC_{str(dFC_id)}.npy", dFC)
+    print(
+        f"subject-level dFC assessment CODE finished running ... for subject: {participant_id} ..."
+    )
 
 #######################################################################################
diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 59c8792..7ee8870 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -190,10 +190,10 @@ def run_roi_signal_extraction(
     else:
         fmriprep_root = dataset_info["fmriprep_root"]
 
-    if "{main_root}" in dataset_info["output_root"]:
-        output_root = dataset_info["output_root"].replace("{main_root}", main_root)
+    if "{main_root}" in dataset_info["roi_root"]:
+        output_root = dataset_info["roi_root"].replace("{main_root}", main_root)
     else:
-        output_root = dataset_info["output_root"]
+        output_root = dataset_info["roi_root"]
 
     for task in TASKS:
         run_roi_signal_extraction(

From 8a1b0ba7a69665566f36451e8365851f5151ba08 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 17 May 2024 23:05:05 -0400
Subject: [PATCH 017/401] dFC_assessment: call subj_lvl_dFC_assess with time_series kwarg

---
 task_dFC/dFC_assessment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py
index 84564c4..cb8993f 100644
--- a/task_dFC/dFC_assessment.py
+++ b/task_dFC/dFC_assessment.py
@@ -120,7 +120,7 @@ def run_dFC_assess(
         print("Measurement Started ...")
 
         print("dFC estimation started...")
-        dFC_dict = MA.subj_lvl_dFC_assess(time_series_dict=BOLD)
+        dFC_dict = MA.subj_lvl_dFC_assess(time_series=BOLD)
         print("dFC estimation done.")
 
         print(f"Measurement required {time.time() - tic:0.3f} seconds.")

From 0e6e1b79b7d4d9e2d04d7f6e620784297b5b33ce Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 18 May 2024 11:20:40 -0400
Subject: [PATCH 018/401] dFC_assessment: return early when no ROI files match task

---
 task_dFC/dFC_assessment.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py
index cb8993f..b3068b0 100644
--- a/task_dFC/dFC_assessment.py
+++ b/task_dFC/dFC_assessment.py
@@ -38,6 +38,11 @@ def run_dFC_assess(
     ]
     ALL_ROI_FILES.sort()
 
+    # if there are no files for this task, return
+    if not len(ALL_ROI_FILES) >= 1:
+        print(f"No time series files found for {subj_id} {task}")
+        return
+
     # check if "_run" exists in all the task file names
     if all(["_run" in roi_file for roi_file in ALL_ROI_FILES]):
         # find all the runs

From f38d22a161d4f3f7f92a68055673ed614b681d4b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 26 May 2024 16:26:56 -0400
Subject: [PATCH 019/401] update ML.py

---
 simul_dFC/KNN_ML.py | 640 ++++++++++++++++++++++++++++----------------
 task_dFC/ML.py      | 461 +++++++++++++++++++++++++++++++
 2 files changed, 875 insertions(+), 226 deletions(-)
 create mode 100644 task_dFC/ML.py

diff --git a/simul_dFC/KNN_ML.py b/simul_dFC/KNN_ML.py
index 44eca0a..c1b60cc 100644
--- a/simul_dFC/KNN_ML.py
+++ b/simul_dFC/KNN_ML.py
@@ -1,9 +1,11 @@
+import argparse
+import json
 import os
 
 import numpy as np
 from sklearn.decomposition import PCA
 from sklearn.metrics import balanced_accuracy_score
-from sklearn.model_selection import GridSearchCV, train_test_split
+from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
@@ -11,128 +13,54 @@
 from pydfc import DFC, data_loader, task_utils
 from pydfc.dfc_utils import dFC_mat2vec, rank_norm
 
-# Data parameters
-dataset = "ds000001"
-
-# main_root = f"./DATA/{dataset}" # for local
-main_root = f"../../DATA/task-based/simulated/{dataset}"  # for server
-roi_root = f"{main_root}/derivatives/ROI_timeseries"
-dFC_root = f"{main_root}/derivatives/dFC_assessed"
-output_root = "./ML_RESULTS_KNN_classify"
-
-TASKS = ["task-pulse"]
-
-dynamic_pred = "no"  # 'past' or 'past_and_future' or 'no' (only current TR)
-normalize_dFC = True
-
-SUBJECTS = data_loader.find_subj_list(data_root=roi_root, sessions=TASKS)
-
-# randomly select 80% of the subjects for training and 20% for testing using numpy.random.choice
-train_subjects = np.random.choice(SUBJECTS, int(0.8 * len(SUBJECTS)), replace=False)
-test_subjects = np.setdiff1d(SUBJECTS, train_subjects)
-
-print(
-    f"number of train_subjects: {len(train_subjects)} and test_subjects: {len(test_subjects)}"
-)
-
-
-################## TASK FEATURES ##################
-
-task_features = {
-    "task": list(),
-    "relative_task_on": list(),
-    "avg_task_duration": list(),
-    "var_task_duration": list(),
-    "avg_rest_duration": list(),
-    "var_rest_duration": list(),
-    "num_of_transitions": list(),
-    "relative_transition_freq": list(),
-}
-for task_id, task in enumerate(TASKS):
-
-    if task == "task-restingstate":
-        continue
-
-    for subj in SUBJECTS:
-        # event data
-        task_data = np.load(
-            f"{roi_root}/{subj}_{task}/task_data.npy", allow_pickle="TRUE"
-        ).item()
-        Fs_task = task_data["Fs_task"]
-        TR_task = 1 / Fs_task
-
-        task_presence = task_utils.extract_task_presence(
-            event_labels=task_data["event_labels"],
-            TR_task=TR_task,
-            TR_mri=task_data["TR_mri"],
-            binary=True,
-        )
-
-        relative_task_on = task_utils.relative_task_on(task_presence)
-        # task duration
-        avg_task_duration, var_task_duration = task_utils.task_duration(
-            task_presence, task_data["TR_mri"]
-        )
-        # rest duration
-        avg_rest_duration, var_rest_duration = task_utils.rest_duration(
-            task_presence, task_data["TR_mri"]
-        )
-        # freq of transitions
-        num_of_transitions, relative_transition_freq = task_utils.transition_freq(
-            task_presence
-        )
-
-        task_features["task"].append(task)
-        task_features["relative_task_on"].append(relative_task_on)
-        task_features["avg_task_duration"].append(avg_task_duration)
-        task_features["var_task_duration"].append(var_task_duration)
-        task_features["avg_rest_duration"].append(avg_rest_duration)
-        task_features["var_rest_duration"].append(var_rest_duration)
-        task_features["num_of_transitions"].append(num_of_transitions)
-        task_features["relative_transition_freq"].append(relative_transition_freq)
-
-
-################## TASK PRESENCE CLASSIFICATION ##################
-ML_scores = {
-    "subj_id": list(),
-    "group": list(),
-    "task": list(),
-    "dFC method": list(),
-    "KNN accuracy": list(),
-}
-for dFC_id in range(0, 7):
-    print(f"=================== dFC {dFC_id} ===================")
-
-    ML_RESULT = {}
+#######################################################################################
+
+
+def find_available_subjects(dFC_root, task, dFC_id=None):
+    """
+    Find the subjects that have dFC results for the given task and dFC_id (method).
+    """
+    SUBJECTS = list()
+    ALL_SUBJ_FOLDERS = os.listdir(f"{dFC_root}/")
+    ALL_SUBJ_FOLDERS = [folder for folder in ALL_SUBJ_FOLDERS if "sub-" in folder]
+    ALL_SUBJ_FOLDERS.sort()
+    for subj_folder in ALL_SUBJ_FOLDERS:
+        ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/")
+        ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if task in dFC_file]
+        if dFC_id is not None:
+            ALL_DFC_FILES = [
+                dFC_file for dFC_file in ALL_DFC_FILES if f"_{dFC_id}.npy" in dFC_file
+            ]
+        ALL_DFC_FILES.sort()
+        if len(ALL_DFC_FILES) > 0:
+            SUBJECTS.append(subj_folder)
+    return SUBJECTS
+
+
+def extract_task_features(TASKS, roi_root, output_root):
+    """
+    Extract task features from the event data."""
+    task_features = {
+        "task": list(),
+        "relative_task_on": list(),
+        "avg_task_duration": list(),
+        "var_task_duration": list(),
+        "avg_rest_duration": list(),
+        "var_rest_duration": list(),
+        "num_of_transitions": list(),
+        "relative_transition_freq": list(),
+    }
     for task_id, task in enumerate(TASKS):
-        print(f"=============== {task} ===============")
 
         if task == "task-restingstate":
             continue
 
-        X_train = None
-        X_test = None
-        y_condition_train = None
-        y_condition_test = None
-        subj_label_train = list()
-        subj_label_test = list()
+        SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task)
 
         for subj in SUBJECTS:
-
-            dFC = np.load(
-                f"{dFC_root}/{task}/{subj}/dFC_{dFC_id}.npy", allow_pickle="TRUE"
-            ).item()
-
-            dFC_mat = dFC.get_dFC_mat()
-            TR_array = dFC.TR_array
-            if normalize_dFC:
-                dFC_mat = rank_norm(dFC_mat)
-
-            dFC_vecs = dFC_mat2vec(dFC_mat)
-
             # event data
             task_data = np.load(
-                f"{roi_root}/{subj}_{task}/task_data.npy", allow_pickle="TRUE"
+                f"{roi_root}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
             ).item()
             Fs_task = task_data["Fs_task"]
             TR_task = 1 / Fs_task
@@ -141,132 +69,392 @@
                 event_labels=task_data["event_labels"],
                 TR_task=TR_task,
                 TR_mri=task_data["TR_mri"],
-                TR_array=TR_array,
                 binary=True,
             )
 
-            X_new = dFC_vecs
-            y_new = task_presence.ravel()
-
-            if dynamic_pred == "past":
-                # concat current TR and two TR before of X_new to predict the current TR of y_new
-                # ignore the edge case of the first two TRs
-                X_new = np.concatenate(
-                    (X_new, np.roll(X_new, 1, axis=0), np.roll(X_new, 2, axis=0)), axis=1
-                )
-                X_new = X_new[2:, :]
-                y_new = y_new[2:]
-
-            elif dynamic_pred == "past_and_future":
-                # concat current TR and two TR before and after of X_new to predict the current TR of y_new
-                # ignore the edge case of the first and last two TRs
-                X_new = np.concatenate(
-                    (
-                        X_new,
-                        np.roll(X_new, 1, axis=0),
-                        np.roll(X_new, 2, axis=0),
-                        np.roll(X_new, -1, axis=0),
-                        np.roll(X_new, -2, axis=0),
-                    ),
-                    axis=1,
-                )
-                X_new = X_new[2:-2, :]
-                y_new = y_new[2:-2]
-
-            if subj in train_subjects:
-                subj_label_train.extend([subj for i in range(X_new.shape[0])])
-                if X_train is None and y_condition_train is None:
-                    X_train = X_new
-                    y_condition_train = y_new
-                else:
-                    X_train = np.concatenate((X_train, X_new), axis=0)
-                    y_condition_train = np.concatenate((y_condition_train, y_new), axis=0)
-            elif subj in test_subjects:
-                subj_label_test.extend([subj for i in range(X_new.shape[0])])
-                if X_test is None and y_condition_test is None:
-                    X_test = X_new
-                    y_condition_test = y_new
-                else:
-                    X_test = np.concatenate((X_test, X_new), axis=0)
-                    y_condition_test = np.concatenate((y_condition_test, y_new), axis=0)
-
-        print(
-            X_train.shape, X_test.shape, y_condition_train.shape, y_condition_test.shape
-        )
-        subj_label_train = np.array(subj_label_train)
-        subj_label_test = np.array(subj_label_test)
-        print(subj_label_train.shape, subj_label_test.shape)
+            relative_task_on = task_utils.relative_task_on(task_presence)
+            # task duration
+            avg_task_duration, var_task_duration = task_utils.task_duration(
+                task_presence, task_data["TR_mri"]
+            )
+            # rest duration
+            avg_rest_duration, var_rest_duration = task_utils.rest_duration(
+                task_presence, task_data["TR_mri"]
+            )
+            # freq of transitions
+            num_of_transitions, relative_transition_freq = task_utils.transition_freq(
+                task_presence
+            )
 
-        # task presence classification
+            task_features["task"].append(task)
+            task_features["relative_task_on"].append(relative_task_on)
+            task_features["avg_task_duration"].append(avg_task_duration)
+            task_features["var_task_duration"].append(var_task_duration)
+            task_features["avg_rest_duration"].append(avg_rest_duration)
+            task_features["var_rest_duration"].append(var_rest_duration)
+            task_features["num_of_transitions"].append(num_of_transitions)
+            task_features["relative_transition_freq"].append(relative_transition_freq)
 
-        print("task presence classification ...")
+    folder = f"{output_root}"
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+    np.save(f"{folder}/task_features_KNN_classify.npy", task_features)
+
+
+def dFC_feature_extraction_subj_lvl(
+    dFC,
+    task_data,
+    dynamic_pred="no",
+    normalize_dFC=True,
+):
+    """
+    Extract features and target for task presence classification
+    for a single subject.
+    """
+    # dFC features
+    dFC_mat = dFC.get_dFC_mat()
+    TR_array = dFC.TR_array
+    if normalize_dFC:
+        dFC_mat = rank_norm(dFC_mat)
+    dFC_vecs = dFC_mat2vec(dFC_mat)
+
+    # event data
+    task_presence = task_utils.extract_task_presence(
+        event_labels=task_data["event_labels"],
+        TR_task=1 / task_data["Fs_task"],
+        TR_mri=task_data["TR_mri"],
+        TR_array=TR_array,
+        binary=True,
+    )
+
+    features = dFC_vecs
+    target = task_presence.ravel()
+
+    if dynamic_pred == "past":
+        # concat current TR and two TR before of features to predict the current TR of target
+        # ignore the edge case of the first two TRs
+        features = np.concatenate(
+            (features, np.roll(features, 1, axis=0), np.roll(features, 2, axis=0)), axis=1
+        )
+        features = features[2:, :]
+        target = target[2:]
+    elif dynamic_pred == "past_and_future":
+        # concat current TR and two TR before and after of features to predict the current TR of target
+        # ignore the edge case of the first and last two TRs
+        features = np.concatenate(
+            (
+                features,
+                np.roll(features, 1, axis=0),
+                np.roll(features, 2, axis=0),
+                np.roll(features, -1, axis=0),
+                np.roll(features, -2, axis=0),
+            ),
+            axis=1,
+        )
+        features = features[2:-2, :]
+        target = target[2:-2]
+
+    return features, target
+
+
+def dFC_feature_extraction(
+    task,
+    train_subjects,
+    test_subjects,
+    dFC_id,
+    roi_root,
+    dFC_root,
+    dynamic_pred="no",
+    normalize_dFC=True,
+):
+    """
+    Extract features and target for task presence classification
+    for all subjects.
+    """
+    X_train = None
+    y_train = None
+    subj_label_train = list()
+    for subj in train_subjects:
+        dFC = np.load(
+            f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
+        ).item()
 
-        # find num_PCs
-        pca = PCA(svd_solver="full", whiten=False)
-        pca.fit(X_train)
-        num_PCs = np.where(np.cumsum(pca.explained_variance_ratio_) > 0.95)[0][0] + 1
+        task_data = np.load(
+            f"{roi_root}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
+        ).item()
 
-        # create a pipeline with a knn model to find the best n_neighbors
-        knn = make_pipeline(
-            StandardScaler(),
-            PCA(n_components=num_PCs),
-            KNeighborsClassifier(),
+        X_subj, y_subj = dFC_feature_extraction_subj_lvl(
+            dFC=dFC,
+            task_data=task_data,
+            dynamic_pred=dynamic_pred,
+            normalize_dFC=normalize_dFC,
         )
-        # create a dictionary of all values we want to test for n_neighbors
-        param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)}
-        # use gridsearch to test all values for n_neighbors
-        knn_gscv = GridSearchCV(knn, param_grid, cv=5)
-        # fit model to data
-        knn_gscv.fit(X_train, y_condition_train)
-
-        n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"]
-
-        neigh = make_pipeline(
-            StandardScaler(),
-            PCA(n_components=num_PCs),
-            KNeighborsClassifier(n_neighbors=n_neighbors),
-        ).fit(X_train, y_condition_train)
-
-        ML_RESULT[task] = {
-            "pca": pca,
-            "num_PCs": num_PCs,
-            "cv_results": knn_gscv.cv_results_,
-            "KNN": neigh,
-            "KNN train score": neigh.score(X_train, y_condition_train),
-            "KNN test score": neigh.score(X_test, y_condition_test),
-        }
-
-        print(
-            f"KNN train score {dFC.measure.measure_name} {task}: {neigh.score(X_train, y_condition_train)}"
+
+        subj_label_train.extend([subj for i in range(X_subj.shape[0])])
+        if X_train is None and y_train is None:
+            X_train = X_subj
+            y_train = y_subj
+        else:
+            X_train = np.concatenate((X_train, X_subj), axis=0)
+            y_train = np.concatenate((y_train, y_subj), axis=0)
+
+    X_test = None
+    y_test = None
+    subj_label_test = list()
+    for subj in test_subjects:
+        dFC = np.load(
+            f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
+        ).item()
+
+        task_data = np.load(
+            f"{roi_root}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
+        ).item()
+
+        X_subj, y_subj = dFC_feature_extraction_subj_lvl(
+            dFC=dFC,
+            task_data=task_data,
+            dynamic_pred=dynamic_pred,
+            normalize_dFC=normalize_dFC,
         )
-        print(
-            f"KNN test score {dFC.measure.measure_name} {task}: {neigh.score(X_test, y_condition_test)}"
+
+        subj_label_test.extend([subj for i in range(X_subj.shape[0])])
+        if X_test is None and y_test is None:
+            X_test = X_subj
+            y_test = y_subj
+        else:
+            X_test = np.concatenate((X_test, X_subj), axis=0)
+            y_test = np.concatenate((y_test, y_subj), axis=0)
+
+    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
+    subj_label_train = np.array(subj_label_train)
+    subj_label_test = np.array(subj_label_test)
+
+    return (
+        X_train,
+        X_test,
+        y_train,
+        y_test,
+        subj_label_train,
+        subj_label_test,
+        dFC.measure.measure_name,
+    )
+
+
+def task_presence_classification(
+    task,
+    dFC_id,
+    roi_root,
+    dFC_root,
+    dynamic_pred="no",
+    normalize_dFC=True,
+    train_test_ratio=0.8,
+    explained_var_threshold=0.95,
+):
+    print(f"=============== {task} ===============")
+
+    if task == "task-restingstate":
+        return
+
+    SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id)
+
+    # randomly select train_test_ratio of the subjects for training
+    # and rest for testing using numpy.random.choice
+    train_subjects = np.random.choice(
+        SUBJECTS, int(train_test_ratio * len(SUBJECTS)), replace=False
+    )
+    test_subjects = np.setdiff1d(SUBJECTS, train_subjects)
+    print(
+        f"Number of train subjects: {len(train_subjects)} and test subjects: {len(test_subjects)}"
+    )
+
+    X_train, X_test, y_train, y_test, subj_label_train, subj_label_test, measure_name = (
+        dFC_feature_extraction(
+            task=task,
+            train_subjects=train_subjects,
+            test_subjects=test_subjects,
+            dFC_id=dFC_id,
+            roi_root=roi_root,
+            dFC_root=dFC_root,
+            dynamic_pred=dynamic_pred,
+            normalize_dFC=normalize_dFC,
         )
+    )
+
+    # task presence classification
+
+    print("task presence classification ...")
+
+    # find num_PCs
+    pca = PCA(svd_solver="full", whiten=False)
+    pca.fit(X_train)
+    num_PCs = (
+        np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
+        + 1
+    )
+
+    # create a pipeline with a knn model to find the best n_neighbors
+    knn = make_pipeline(
+        StandardScaler(),
+        PCA(n_components=num_PCs),
+        KNeighborsClassifier(),
+    )
+    # create a dictionary of all values we want to test for n_neighbors
+    param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)}
+    # use gridsearch to test all values for n_neighbors
+    knn_gscv = GridSearchCV(knn, param_grid, cv=5)
+    # fit model to data
+    knn_gscv.fit(X_train, y_train)
+
+    n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"]
+
+    neigh = make_pipeline(
+        StandardScaler(),
+        PCA(n_components=num_PCs),
+        KNeighborsClassifier(n_neighbors=n_neighbors),
+    ).fit(X_train, y_train)
+
+    ML_RESULT = {
+        "pca": pca,
+        "num_PCs": num_PCs,
+        "cv_results": knn_gscv.cv_results_,
+        "KNN": neigh,
+        "KNN train score": neigh.score(X_train, y_train),
+        "KNN test score": neigh.score(X_test, y_test),
+    }
+
+    print(f"KNN train score {measure_name} {task}: {neigh.score(X_train, y_train)}")
+    print(f"KNN test score {measure_name} {task}: {neigh.score(X_test, y_test)}")
+
+    # measure pred score on each subj
+
+    ML_scores = {
+        "subj_id": list(),
+        "group": list(),
+        "task": list(),
+        "dFC method": list(),
+        "KNN accuracy": list(),
+    }
+    for subj in SUBJECTS:
+        ML_scores["subj_id"].append(subj)
+        if subj in train_subjects:
+            ML_scores["group"].append("train")
+            features = X_train[subj_label_train == subj, :]
+            target = y_train[subj_label_train == subj]
+        elif subj in test_subjects:
+            ML_scores["group"].append("test")
+            features = X_test[subj_label_test == subj, :]
+            target = y_test[subj_label_test == subj]
+
+        pred = neigh.predict(features)
+
+        ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred))
+
+        ML_scores["task"].append(task)
+        ML_scores["dFC method"].append(measure_name)
+
+    return ML_RESULT, ML_scores
+
+
+def run_classification(
+    TASKS,
+    roi_root,
+    dFC_root,
+    output_root,
+    dynamic_pred="no",
+    normalize_dFC=True,
+):
+    ML_scores = {
+        "subj_id": list(),
+        "group": list(),
+        "task": list(),
+        "dFC method": list(),
+        "KNN accuracy": list(),
+    }
+    for dFC_id in range(0, 7):
+        print(f"=================== dFC {dFC_id} ===================")
+
+        ML_RESULT = {}
+        for task_id, task in enumerate(TASKS):
+            ML_RESULT_new, ML_scores_new = task_presence_classification(
+                task=task,
+                dFC_id=dFC_id,
+                roi_root=roi_root,
+                dFC_root=dFC_root,
+                dynamic_pred=dynamic_pred,
+                normalize_dFC=normalize_dFC,
+            )
+            ML_RESULT[task] = ML_RESULT_new
+            for key in ML_scores:
+                ML_scores[key].extend(ML_scores_new[key])
 
-        # measure pred score on each subj
+        folder = f"{output_root}"
+        if not os.path.exists(folder):
+            os.makedirs(folder)
+        np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT)
 
-        for subj in SUBJECTS:
-            ML_scores["subj_id"].append(subj)
-            if subj in train_subjects:
-                ML_scores["group"].append("train")
-                features = X_train[subj_label_train == subj, :]
-                target = y_condition_train[subj_label_train == subj]
-            elif subj in test_subjects:
-                ML_scores["group"].append("test")
-                features = X_test[subj_label_test == subj, :]
-                target = y_condition_test[subj_label_test == subj]
+    np.save(f"{folder}/ML_scores_KNN_classify.npy", ML_scores)
 
-            pred = neigh.predict(features)
 
-            ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred))
+#######################################################################################
 
-            ML_scores["task"].append(task)
-            ML_scores["dFC method"].append(dFC.measure.measure_name)
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to apply Machine Learning on dFC results to predict task presence.
+    """
 
-    folder = f"{output_root}"
-    if not os.path.exists(folder):
-        os.makedirs(folder)
-    np.save(f"{folder}/ML_RESULT_{dFC.measure.measure_name}.npy", ML_RESULT)
+    parser = argparse.ArgumentParser(description=HELPTEXT)
 
-np.save(f"{folder}/task_features_KNN_classify.npy", task_features)
-np.save(f"{folder}/ML_scores_KNN_classify.npy", ML_scores)
+    parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
+
+    args = parser.parse_args()
+
+    dataset_info_file = args.dataset_info
+
+    # Read global configs
+    with open(dataset_info_file, "r") as f:
+        dataset_info = json.load(f)
+
+    print("Task presence prediction started ...")
+
+    TASKS = dataset_info["TASKS"]
+
+    if "{dataset}" in dataset_info["main_root"]:
+        main_root = dataset_info["main_root"].replace(
+            "{dataset}", dataset_info["dataset"]
+        )
+    else:
+        main_root = dataset_info["main_root"]
+
+    if "{main_root}" in dataset_info["roi_root"]:
+        roi_root = dataset_info["roi_root"].replace("{main_root}", main_root)
+    else:
+        roi_root = dataset_info["roi_root"]
+
+    if "{main_root}" in dataset_info["dFC_root"]:
+        dFC_root = dataset_info["dFC_root"].replace("{main_root}", main_root)
+    else:
+        dFC_root = dataset_info["dFC_root"]
+
+    if "{main_root}" in dataset_info["ML_root"]:
+        ML_root = dataset_info["ML_root"].replace("{main_root}", main_root)
+    else:
+        ML_root = dataset_info["ML_root"]
+
+    extract_task_features(
+        TASKS=TASKS,
+        roi_root=roi_root,
+        output_root=ML_root,
+    )
+    run_classification(
+        TASKS=TASKS,
+        roi_root=roi_root,
+        dFC_root=dFC_root,
+        output_root=ML_root,
+        dynamic_pred="no",
+        normalize_dFC=True,
+    )
+
+    print("Task presence prediction CODE finished running.")
+
+#######################################################################################
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
new file mode 100644
index 0000000..2e19811
--- /dev/null
+++ b/task_dFC/ML.py
@@ -0,0 +1,461 @@
+import argparse
+import json
+import os
+
+import numpy as np
+from sklearn.decomposition import PCA
+from sklearn.metrics import balanced_accuracy_score
+from sklearn.model_selection import GridSearchCV
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+
+from pydfc import DFC, data_loader, task_utils
+from pydfc.dfc_utils import dFC_mat2vec, rank_norm
+
+#######################################################################################
+
+
+def find_available_subjects(dFC_root, task, dFC_id=None):
+    """
+    Find the subjects that have dFC results for the given task and dFC_id (method).
+    """
+    SUBJECTS = list()
+    ALL_SUBJ_FOLDERS = os.listdir(f"{dFC_root}/")
+    ALL_SUBJ_FOLDERS = [folder for folder in ALL_SUBJ_FOLDERS if "sub-" in folder]
+    ALL_SUBJ_FOLDERS.sort()
+    for subj_folder in ALL_SUBJ_FOLDERS:
+        ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/")
+        ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if task in dFC_file]
+        if dFC_id is not None:
+            ALL_DFC_FILES = [
+                dFC_file for dFC_file in ALL_DFC_FILES if f"_{dFC_id}.npy" in dFC_file
+            ]
+        ALL_DFC_FILES.sort()
+        if len(ALL_DFC_FILES) > 0:
+            SUBJECTS.append(subj_folder)
+    return SUBJECTS
+
+
+def extract_task_features(TASKS, roi_root, output_root):
+    """
+    Extract task features from the event data."""
+    task_features = {
+        "task": list(),
+        "relative_task_on": list(),
+        "avg_task_duration": list(),
+        "var_task_duration": list(),
+        "avg_rest_duration": list(),
+        "var_rest_duration": list(),
+        "num_of_transitions": list(),
+        "relative_transition_freq": list(),
+    }
+    for task_id, task in enumerate(TASKS):
+
+        if task == "task-restingstate":
+            continue
+
+        SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task)
+
+        for subj in SUBJECTS:
+            # event data
+            task_data = np.load(
+                f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
+            ).item()
+            Fs_task = task_data["Fs_task"]
+            TR_task = 1 / Fs_task
+
+            task_presence = task_utils.extract_task_presence(
+                event_labels=task_data["event_labels"],
+                TR_task=TR_task,
+                TR_mri=task_data["TR_mri"],
+                binary=True,
+            )
+
+            relative_task_on = task_utils.relative_task_on(task_presence)
+            # task duration
+            avg_task_duration, var_task_duration = task_utils.task_duration(
+                task_presence, task_data["TR_mri"]
+            )
+            # rest duration
+            avg_rest_duration, var_rest_duration = task_utils.rest_duration(
+                task_presence, task_data["TR_mri"]
+            )
+            # freq of transitions
+            num_of_transitions, relative_transition_freq = task_utils.transition_freq(
+                task_presence
+            )
+
+            task_features["task"].append(task)
+            task_features["relative_task_on"].append(relative_task_on)
+            task_features["avg_task_duration"].append(avg_task_duration)
+            task_features["var_task_duration"].append(var_task_duration)
+            task_features["avg_rest_duration"].append(avg_rest_duration)
+            task_features["var_rest_duration"].append(var_rest_duration)
+            task_features["num_of_transitions"].append(num_of_transitions)
+            task_features["relative_transition_freq"].append(relative_transition_freq)
+
+    folder = f"{output_root}"
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+    np.save(f"{folder}/task_features.npy", task_features)
+
+
+def dFC_feature_extraction_subj_lvl(
+    dFC,
+    task_data,
+    dynamic_pred="no",
+    normalize_dFC=True,
+):
+    """
+    Extract features and target for task presence classification
+    for a single subject.
+    dynamic_pred: "no", "past", "past_and_future"
+    """
+    # dFC features
+    dFC_mat = dFC.get_dFC_mat()
+    TR_array = dFC.TR_array
+    if normalize_dFC:
+        dFC_mat = rank_norm(dFC_mat)
+    dFC_vecs = dFC_mat2vec(dFC_mat)
+
+    # event data
+    task_presence = task_utils.extract_task_presence(
+        event_labels=task_data["event_labels"],
+        TR_task=1 / task_data["Fs_task"],
+        TR_mri=task_data["TR_mri"],
+        TR_array=TR_array,
+        binary=True,
+    )
+
+    features = dFC_vecs
+    target = task_presence.ravel()
+
+    if dynamic_pred == "past":
+        # concat current TR and two TR before of features to predict the current TR of target
+        # ignore the edge case of the first two TRs
+        features = np.concatenate(
+            (features, np.roll(features, 1, axis=0), np.roll(features, 2, axis=0)), axis=1
+        )
+        features = features[2:, :]
+        target = target[2:]
+    elif dynamic_pred == "past_and_future":
+        # concat current TR and two TR before and after of features to predict the current TR of target
+        # ignore the edge case of the first and last two TRs
+        features = np.concatenate(
+            (
+                features,
+                np.roll(features, 1, axis=0),
+                np.roll(features, 2, axis=0),
+                np.roll(features, -1, axis=0),
+                np.roll(features, -2, axis=0),
+            ),
+            axis=1,
+        )
+        features = features[2:-2, :]
+        target = target[2:-2]
+
+    return features, target
+
+
+def dFC_feature_extraction(
+    task,
+    train_subjects,
+    test_subjects,
+    dFC_id,
+    roi_root,
+    dFC_root,
+    dynamic_pred="no",
+    normalize_dFC=True,
+):
+    """
+    Extract features and target for task presence classification
+    for all subjects.
+    """
+    X_train = None
+    y_train = None
+    subj_label_train = list()
+    for subj in train_subjects:
+        dFC = np.load(
+            f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
+        ).item()
+
+        task_data = np.load(
+            f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
+        ).item()
+
+        X_subj, y_subj = dFC_feature_extraction_subj_lvl(
+            dFC=dFC,
+            task_data=task_data,
+            dynamic_pred=dynamic_pred,
+            normalize_dFC=normalize_dFC,
+        )
+
+        subj_label_train.extend([subj for i in range(X_subj.shape[0])])
+        if X_train is None and y_train is None:
+            X_train = X_subj
+            y_train = y_subj
+        else:
+            X_train = np.concatenate((X_train, X_subj), axis=0)
+            y_train = np.concatenate((y_train, y_subj), axis=0)
+
+    X_test = None
+    y_test = None
+    subj_label_test = list()
+    for subj in test_subjects:
+        dFC = np.load(
+            f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
+        ).item()
+
+        task_data = np.load(
+            f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
+        ).item()
+
+        X_subj, y_subj = dFC_feature_extraction_subj_lvl(
+            dFC=dFC,
+            task_data=task_data,
+            dynamic_pred=dynamic_pred,
+            normalize_dFC=normalize_dFC,
+        )
+
+        subj_label_test.extend([subj for i in range(X_subj.shape[0])])
+        if X_test is None and y_test is None:
+            X_test = X_subj
+            y_test = y_subj
+        else:
+            X_test = np.concatenate((X_test, X_subj), axis=0)
+            y_test = np.concatenate((y_test, y_subj), axis=0)
+
+    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
+    subj_label_train = np.array(subj_label_train)
+    subj_label_test = np.array(subj_label_test)
+
+    return (
+        X_train,
+        X_test,
+        y_train,
+        y_test,
+        subj_label_train,
+        subj_label_test,
+        dFC.measure.measure_name,
+    )
+
+
+def task_presence_classification(
+    task,
+    dFC_id,
+    roi_root,
+    dFC_root,
+    dynamic_pred="no",
+    normalize_dFC=True,
+    train_test_ratio=0.8,
+    explained_var_threshold=0.95,
+):
+    print(f"=============== {task} ===============")
+
+    if task == "task-restingstate":
+        return
+
+    SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id)
+
+    # randomly select train_test_ratio of the subjects for training
+    # and rest for testing using numpy.random.choice
+    train_subjects = np.random.choice(
+        SUBJECTS, int(train_test_ratio * len(SUBJECTS)), replace=False
+    )
+    test_subjects = np.setdiff1d(SUBJECTS, train_subjects)
+    print(
+        f"Number of train subjects: {len(train_subjects)} and test subjects: {len(test_subjects)}"
+    )
+
+    X_train, X_test, y_train, y_test, subj_label_train, subj_label_test, measure_name = (
+        dFC_feature_extraction(
+            task=task,
+            train_subjects=train_subjects,
+            test_subjects=test_subjects,
+            dFC_id=dFC_id,
+            roi_root=roi_root,
+            dFC_root=dFC_root,
+            dynamic_pred=dynamic_pred,
+            normalize_dFC=normalize_dFC,
+        )
+    )
+
+    # task presence classification
+
+    print("task presence classification ...")
+
+    # find num_PCs
+    pca = PCA(svd_solver="full", whiten=False)
+    pca.fit(X_train)
+    num_PCs = (
+        np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
+        + 1
+    )
+
+    # create a pipeline with a knn model to find the best n_neighbors
+    knn = make_pipeline(
+        StandardScaler(),
+        PCA(n_components=num_PCs),
+        KNeighborsClassifier(),
+    )
+    # create a dictionary of all values we want to test for n_neighbors
+    param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)}
+    # use gridsearch to test all values for n_neighbors
+    knn_gscv = GridSearchCV(knn, param_grid, cv=5)
+    # fit model to data
+    knn_gscv.fit(X_train, y_train)
+
+    n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"]
+
+    neigh = make_pipeline(
+        StandardScaler(),
+        PCA(n_components=num_PCs),
+        KNeighborsClassifier(n_neighbors=n_neighbors),
+    ).fit(X_train, y_train)
+
+    ML_RESULT = {
+        "pca": pca,
+        "num_PCs": num_PCs,
+        "cv_results": knn_gscv.cv_results_,
+        "KNN": neigh,
+        "KNN train score": neigh.score(X_train, y_train),
+        "KNN test score": neigh.score(X_test, y_test),
+    }
+
+    print(f"KNN train score {measure_name} {task}: {neigh.score(X_train, y_train)}")
+    print(f"KNN test score {measure_name} {task}: {neigh.score(X_test, y_test)}")
+
+    # measure pred score on each subj
+
+    ML_scores = {
+        "subj_id": list(),
+        "group": list(),
+        "task": list(),
+        "dFC method": list(),
+        "KNN accuracy": list(),
+    }
+    for subj in SUBJECTS:
+        ML_scores["subj_id"].append(subj)
+        if subj in train_subjects:
+            ML_scores["group"].append("train")
+            features = X_train[subj_label_train == subj, :]
+            target = y_train[subj_label_train == subj]
+        elif subj in test_subjects:
+            ML_scores["group"].append("test")
+            features = X_test[subj_label_test == subj, :]
+            target = y_test[subj_label_test == subj]
+
+        pred = neigh.predict(features)
+
+        ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred))
+
+        ML_scores["task"].append(task)
+        ML_scores["dFC method"].append(measure_name)
+
+    return ML_RESULT, ML_scores
+
+
+def run_classification(
+    TASKS,
+    roi_root,
+    dFC_root,
+    output_root,
+    dynamic_pred="no",
+    normalize_dFC=True,
+):
+    ML_scores = {
+        "subj_id": list(),
+        "group": list(),
+        "task": list(),
+        "dFC method": list(),
+        "KNN accuracy": list(),
+    }
+    for dFC_id in range(0, 7):
+        print(f"=================== dFC {dFC_id} ===================")
+
+        ML_RESULT = {}
+        for task_id, task in enumerate(TASKS):
+            ML_RESULT_new, ML_scores_new = task_presence_classification(
+                task=task,
+                dFC_id=dFC_id,
+                roi_root=roi_root,
+                dFC_root=dFC_root,
+                dynamic_pred=dynamic_pred,
+                normalize_dFC=normalize_dFC,
+            )
+            ML_RESULT[task] = ML_RESULT_new
+            for key in ML_scores:
+                ML_scores[key].extend(ML_scores_new[key])
+
+        folder = f"{output_root}"
+        if not os.path.exists(folder):
+            os.makedirs(folder)
+        np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT)
+
+    np.save(f"{folder}/ML_scores_classify.npy", ML_scores)
+
+
+#######################################################################################
+
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to apply Machine Learning on dFC results to predict task presence.
+    """
+
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
+
+    args = parser.parse_args()
+
+    dataset_info_file = args.dataset_info
+
+    # Read global configs
+    with open(dataset_info_file, "r") as f:
+        dataset_info = json.load(f)
+
+    print("Task presence prediction started ...")
+
+    TASKS = dataset_info["TASKS"]
+
+    if "{dataset}" in dataset_info["main_root"]:
+        main_root = dataset_info["main_root"].replace(
+            "{dataset}", dataset_info["dataset"]
+        )
+    else:
+        main_root = dataset_info["main_root"]
+
+    if "{main_root}" in dataset_info["roi_root"]:
+        roi_root = dataset_info["roi_root"].replace("{main_root}", main_root)
+    else:
+        roi_root = dataset_info["roi_root"]
+
+    if "{main_root}" in dataset_info["dFC_root"]:
+        dFC_root = dataset_info["dFC_root"].replace("{main_root}", main_root)
+    else:
+        dFC_root = dataset_info["dFC_root"]
+
+    if "{main_root}" in dataset_info["ML_root"]:
+        ML_root = dataset_info["ML_root"].replace("{main_root}", main_root)
+    else:
+        ML_root = dataset_info["ML_root"]
+
+    extract_task_features(
+        TASKS=TASKS,
+        roi_root=roi_root,
+        output_root=ML_root,
+    )
+    run_classification(
+        TASKS=TASKS,
+        roi_root=roi_root,
+        dFC_root=dFC_root,
+        output_root=ML_root,
+        dynamic_pred="no",
+        normalize_dFC=True,
+    )
+
+    print("Task presence prediction CODE finished running.")
+
+#######################################################################################

From bf1e25b96950d09b0133b0138e133269e381c499 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 29 May 2024 15:18:45 -0400
Subject: [PATCH 020/401] add run to ML.py

---
 task_dFC/ML.py | 106 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 78 insertions(+), 28 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 2e19811..9f14283 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -16,7 +16,7 @@
 #######################################################################################
 
 
-def find_available_subjects(dFC_root, task, dFC_id=None):
+def find_available_subjects(dFC_root, task, run=None, dFC_id=None):
     """
     Find the subjects that have dFC results for the given task and dFC_id (method).
     """
@@ -31,6 +31,8 @@ def find_available_subjects(dFC_root, task, dFC_id=None):
             ALL_DFC_FILES = [
                 dFC_file for dFC_file in ALL_DFC_FILES if f"_{dFC_id}.npy" in dFC_file
             ]
+        if run is not None:
+            ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if run in dFC_file]
         ALL_DFC_FILES.sort()
         if len(ALL_DFC_FILES) > 0:
             SUBJECTS.append(subj_folder)
@@ -165,24 +167,34 @@ def dFC_feature_extraction(
     dFC_id,
     roi_root,
     dFC_root,
+    run=None,
     dynamic_pred="no",
     normalize_dFC=True,
 ):
     """
     Extract features and target for task presence classification
     for all subjects.
+    if run is specified, dFC results for that run will be used.
     """
     X_train = None
     y_train = None
     subj_label_train = list()
     for subj in train_subjects:
-        dFC = np.load(
-            f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
-        ).item()
-
-        task_data = np.load(
-            f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
-        ).item()
+        if run is None:
+            dFC = np.load(
+                f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
+            ).item()
+            task_data = np.load(
+                f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
+            ).item()
+        else:
+            dFC = np.load(
+                f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE"
+            ).item()
+            task_data = np.load(
+                f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy",
+                allow_pickle="TRUE",
+            ).item()
 
         X_subj, y_subj = dFC_feature_extraction_subj_lvl(
             dFC=dFC,
@@ -203,13 +215,21 @@ def dFC_feature_extraction(
     y_test = None
     subj_label_test = list()
     for subj in test_subjects:
-        dFC = np.load(
-            f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
-        ).item()
-
-        task_data = np.load(
-            f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
-        ).item()
+        if run is None:
+            dFC = np.load(
+                f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
+            ).item()
+            task_data = np.load(
+                f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
+            ).item()
+        else:
+            dFC = np.load(
+                f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE"
+            ).item()
+            task_data = np.load(
+                f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy",
+                allow_pickle="TRUE",
+            ).item()
 
         X_subj, y_subj = dFC_feature_extraction_subj_lvl(
             dFC=dFC,
@@ -246,17 +266,26 @@ def task_presence_classification(
     dFC_id,
     roi_root,
     dFC_root,
+    run=None,
     dynamic_pred="no",
     normalize_dFC=True,
     train_test_ratio=0.8,
     explained_var_threshold=0.95,
 ):
-    print(f"=============== {task} ===============")
+    """
+    perform task presence classification using KNN for a given task and dFC method and run.
+    """
+    if run is None:
+        print(f"=============== {task} ===============")
+    else:
+        print(f"=============== {task} {run} ===============")
 
     if task == "task-restingstate":
         return
 
-    SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id)
+    SUBJECTS = find_available_subjects(
+        dFC_root=dFC_root, task=task, run=run, dFC_id=dFC_id
+    )
 
     # randomly select train_test_ratio of the subjects for training
     # and rest for testing using numpy.random.choice
@@ -276,6 +305,7 @@ def task_presence_classification(
             dFC_id=dFC_id,
             roi_root=roi_root,
             dFC_root=dFC_root,
+            run=run,
             dynamic_pred=dynamic_pred,
             normalize_dFC=normalize_dFC,
         )
@@ -332,6 +362,7 @@ def task_presence_classification(
         "subj_id": list(),
         "group": list(),
         "task": list(),
+        "run": list(),
         "dFC method": list(),
         "KNN accuracy": list(),
     }
@@ -351,6 +382,7 @@ def task_presence_classification(
         ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred))
 
         ML_scores["task"].append(task)
+        ML_scores["run"].append(run)
         ML_scores["dFC method"].append(measure_name)
 
     return ML_RESULT, ML_scores
@@ -358,6 +390,7 @@ def task_presence_classification(
 
 def run_classification(
     TASKS,
+    RUNS,
     roi_root,
     dFC_root,
     output_root,
@@ -368,6 +401,7 @@ def run_classification(
         "subj_id": list(),
         "group": list(),
         "task": list(),
+        "run": list(),
         "dFC method": list(),
         "KNN accuracy": list(),
     }
@@ -376,17 +410,25 @@ def run_classification(
 
         ML_RESULT = {}
         for task_id, task in enumerate(TASKS):
-            ML_RESULT_new, ML_scores_new = task_presence_classification(
-                task=task,
-                dFC_id=dFC_id,
-                roi_root=roi_root,
-                dFC_root=dFC_root,
-                dynamic_pred=dynamic_pred,
-                normalize_dFC=normalize_dFC,
-            )
-            ML_RESULT[task] = ML_RESULT_new
-            for key in ML_scores:
-                ML_scores[key].extend(ML_scores_new[key])
+            if RUNS is None:
+                RUNS = {task: [None]}
+            ML_RESULT[task] = {}
+            for run in RUNS[task]:
+                ML_RESULT_new, ML_scores_new = task_presence_classification(
+                    task=task,
+                    dFC_id=dFC_id,
+                    roi_root=roi_root,
+                    dFC_root=dFC_root,
+                    run=run,
+                    dynamic_pred=dynamic_pred,
+                    normalize_dFC=normalize_dFC,
+                )
+                if run is None:
+                    ML_RESULT[task] = ML_RESULT_new
+                else:
+                    ML_RESULT[task][run] = ML_RESULT_new
+                for key in ML_scores:
+                    ML_scores[key].extend(ML_scores_new[key])
 
         folder = f"{output_root}"
         if not os.path.exists(folder):
@@ -419,6 +461,13 @@ def run_classification(
     print("Task presence prediction started ...")
 
     TASKS = dataset_info["TASKS"]
+    if "RUNS" in dataset_info:
+        if dataset_info["RUNS"] is not None:
+            RUNS = dataset_info["RUNS"]
+        else:
+            RUNS = None
+    else:
+        RUNS = None
 
     if "{dataset}" in dataset_info["main_root"]:
         main_root = dataset_info["main_root"].replace(
@@ -449,6 +498,7 @@ def run_classification(
     )
     run_classification(
         TASKS=TASKS,
+        RUNS=RUNS,
         roi_root=roi_root,
         dFC_root=dFC_root,
         output_root=ML_root,

From 3043c43258c0e9b3d6c4d6688d0cb6d4009248a7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 29 May 2024 16:13:47 -0400
Subject: [PATCH 021/401] minor fix

---
 task_dFC/ML.py | 92 +++++++++++++++++++++++++++++---------------------
 1 file changed, 53 insertions(+), 39 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 9f14283..0e5b626 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -39,11 +39,12 @@ def find_available_subjects(dFC_root, task, run=None, dFC_id=None):
     return SUBJECTS
 
 
-def extract_task_features(TASKS, roi_root, output_root):
+def extract_task_features(TASKS, RUNS, roi_root, output_root):
     """
     Extract task features from the event data."""
     task_features = {
         "task": list(),
+        "run": list(),
         "relative_task_on": list(),
         "avg_task_duration": list(),
         "var_task_duration": list(),
@@ -57,45 +58,57 @@ def extract_task_features(TASKS, roi_root, output_root):
         if task == "task-restingstate":
             continue
 
-        SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task)
+        if RUNS is None:
+            RUNS = {task: [None]}
+        for run in RUNS[task]:
 
-        for subj in SUBJECTS:
-            # event data
-            task_data = np.load(
-                f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
-            ).item()
-            Fs_task = task_data["Fs_task"]
-            TR_task = 1 / Fs_task
-
-            task_presence = task_utils.extract_task_presence(
-                event_labels=task_data["event_labels"],
-                TR_task=TR_task,
-                TR_mri=task_data["TR_mri"],
-                binary=True,
-            )
-
-            relative_task_on = task_utils.relative_task_on(task_presence)
-            # task duration
-            avg_task_duration, var_task_duration = task_utils.task_duration(
-                task_presence, task_data["TR_mri"]
-            )
-            # rest duration
-            avg_rest_duration, var_rest_duration = task_utils.rest_duration(
-                task_presence, task_data["TR_mri"]
-            )
-            # freq of transitions
-            num_of_transitions, relative_transition_freq = task_utils.transition_freq(
-                task_presence
-            )
-
-            task_features["task"].append(task)
-            task_features["relative_task_on"].append(relative_task_on)
-            task_features["avg_task_duration"].append(avg_task_duration)
-            task_features["var_task_duration"].append(var_task_duration)
-            task_features["avg_rest_duration"].append(avg_rest_duration)
-            task_features["var_rest_duration"].append(var_rest_duration)
-            task_features["num_of_transitions"].append(num_of_transitions)
-            task_features["relative_transition_freq"].append(relative_transition_freq)
+            SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task, run=run)
+
+            for subj in SUBJECTS:
+                # event data
+                if run is None:
+                    task_data = np.load(
+                        f"{roi_root}/{subj}/{subj}_{task}_task-data.npy",
+                        allow_pickle="TRUE",
+                    ).item()
+                else:
+                    task_data = np.load(
+                        f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy",
+                        allow_pickle="TRUE",
+                    ).item()
+                Fs_task = task_data["Fs_task"]
+                TR_task = 1 / Fs_task
+
+                task_presence = task_utils.extract_task_presence(
+                    event_labels=task_data["event_labels"],
+                    TR_task=TR_task,
+                    TR_mri=task_data["TR_mri"],
+                    binary=True,
+                )
+
+                relative_task_on = task_utils.relative_task_on(task_presence)
+                # task duration
+                avg_task_duration, var_task_duration = task_utils.task_duration(
+                    task_presence, task_data["TR_mri"]
+                )
+                # rest duration
+                avg_rest_duration, var_rest_duration = task_utils.rest_duration(
+                    task_presence, task_data["TR_mri"]
+                )
+                # freq of transitions
+                num_of_transitions, relative_transition_freq = task_utils.transition_freq(
+                    task_presence
+                )
+
+                task_features["task"].append(task)
+                task_features["run"].append(run)
+                task_features["relative_task_on"].append(relative_task_on)
+                task_features["avg_task_duration"].append(avg_task_duration)
+                task_features["var_task_duration"].append(var_task_duration)
+                task_features["avg_rest_duration"].append(avg_rest_duration)
+                task_features["var_rest_duration"].append(var_rest_duration)
+                task_features["num_of_transitions"].append(num_of_transitions)
+                task_features["relative_transition_freq"].append(relative_transition_freq)
 
     folder = f"{output_root}"
     if not os.path.exists(folder):
@@ -493,6 +506,7 @@ def run_classification(
 
     extract_task_features(
         TASKS=TASKS,
+        RUNS=RUNS,
         roi_root=roi_root,
         output_root=ML_root,
     )

From 351c5f23749f7806f0df7d4a1c94b738dc486e51 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 29 May 2024 17:00:33 -0400
Subject: [PATCH 022/401] fix event_types issue

---
 pydfc/task_utils.py             | 15 ++++++++++-----
 task_dFC/nifti_to_roi_signal.py | 10 ++--------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index a24b3cf..8598ca3 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -19,10 +19,10 @@
 
 
 def events_time_to_labels(
-    events, TR_mri, num_time_mri, event_types=[], oversampling=50, return_0_1=False
+    events, TR_mri, num_time_mri, event_types=None, oversampling=50, return_0_1=False
 ):
     """
-    event_types is a list of event types to be considered. If None, 0 and 1s will be returned.
+    event_types is a list of event types to be considered. If None, it will be inferred from the events.
     Assigns the longest event in each TR to that TR (in the interval from last TR to current TR).
     It assumes that the first time point is TR0 which corresponds to [0 sec, TR sec] interval.
     oversampling: number of samples per TR_mri to improve the time resolution of tasks
@@ -43,6 +43,9 @@ def events_time_to_labels(
         events[0, trial_type_idx] == "trial_type"
     ), "Something went wrong with the events file! The trial_type column was not found!"
 
+    if event_types is None:
+        event_types = ["rest"] + list(np.unique(events[1:, trial_type_idx]))
+
     Fs = float(1 / TR_mri) * oversampling
     num_time_task = int(num_time_mri * oversampling)
     event_labels = np.zeros((num_time_task, 1))
@@ -52,8 +55,10 @@ def events_time_to_labels(
             continue
 
         if events[i, trial_type_idx] in event_types:
-            if events[i, trial_type_idx] == "rest":
-                warnings.warn("trial types should not include 'rest'")
+            if ("rest" in events[i, trial_type_idx]) or (
+                "Rest" in events[i, trial_type_idx]
+            ):
+                raise ValueError("trial types should not include 'rest'")
             start_time = float(events[i, onset_idx])
             end_time = float(events[i, onset_idx]) + float(events[i, duration_idx])
             start_timepoint = int(np.rint(start_time * Fs))
@@ -65,7 +70,7 @@ def events_time_to_labels(
     if return_0_1:
         event_labels = np.multiply(event_labels != 0, 1)
 
-    return event_labels, Fs
+    return event_labels, Fs, event_types
 
 
 ################################# Visualization Functions ####################################
diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 7ee8870..507e714 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -109,17 +109,11 @@ def run_roi_signal_extraction(
             events_file = f"{task_events_root}/{ALL_EVENTS_FILES[0]}"
             events = np.genfromtxt(events_file, delimiter="\t", dtype=str)
             # get the event labels
-            # check that "rest" does not already exist in the event types
-            if any(
-                ["rest" in event_type for event_type in list(np.unique(events[1:, 2]))]
-            ):
-                raise ValueError("Event types should not include 'rest'")
-            event_types = ["rest"] + list(np.unique(events[1:, 2]))
-            event_labels, Fs_task = task_utils.events_time_to_labels(
+            event_labels, Fs_task, event_types = task_utils.events_time_to_labels(
                 events=events,
                 TR_mri=TR_mri,
                 num_time_mri=num_time_mri,
-                event_types=event_types,
+                event_types=None,
                 oversampling=oversampling,
                 return_0_1=False,
             )

From 4eeb16c3a0a39da81c92a925f6aa4b55ab325725 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 29 May 2024 19:30:49 -0400
Subject: [PATCH 023/401] set binarizing method to median

---
 pydfc/task_utils.py | 16 ++++++++++++----
 task_dFC/ML.py      |  2 ++
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 8598ca3..fad46d9 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -237,7 +237,9 @@ def downsample_events_hrf(events_hrf, TR_mri, TR_task, method="uniform"):
     return events_hrf_ds
 
 
-def extract_task_presence(event_labels, TR_task, TR_mri, TR_array=None, binary=True):
+def extract_task_presence(
+    event_labels, TR_task, TR_mri, TR_array=None, binary=True, binarizing_method="median"
+):
     """
     event_labels: event labels including 0 and event ids at the time each event happens
     TR_task: TR of task
@@ -247,6 +249,7 @@ def extract_task_presence(event_labels, TR_task, TR_mri, TR_array=None, binary=T
     This function extracts the task presence from the event labels and returns it in the same time points as the dFC data
     It also downsamples the task presence to the time points of the dFC data
     if binary is True, the task presence is binarized using the mean of the task presence
+    binarizing_method: 'median' or 'mean'
     """
 
     # event_labels_all_task is all conditions together, rest vs. task times
@@ -266,9 +269,14 @@ def extract_task_presence(event_labels, TR_task, TR_mri, TR_array=None, binary=T
         event_labels_all_task_hrf = event_labels_all_task_hrf[:, 1]
 
     if binary:
-        task_presence = np.where(
-            event_labels_all_task_hrf > np.mean(event_labels_all_task_hrf), 1, 0
-        )
+        if binarizing_method == "median":
+            task_presence = np.where(
+                event_labels_all_task_hrf > np.median(event_labels_all_task_hrf), 1, 0
+            )
+        elif binarizing_method == "mean":
+            task_presence = np.where(
+                event_labels_all_task_hrf > np.mean(event_labels_all_task_hrf), 1, 0
+            )
     else:
         task_presence = event_labels_all_task_hrf
 
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 0e5b626..d70b4f9 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -84,6 +84,7 @@ def extract_task_features(TASKS, RUNS, roi_root, output_root):
                     TR_task=TR_task,
                     TR_mri=task_data["TR_mri"],
                     binary=True,
+                    binarizing_method="median",
                 )
 
                 relative_task_on = task_utils.relative_task_on(task_presence)
@@ -141,6 +142,7 @@ def dFC_feature_extraction_subj_lvl(
         TR_mri=task_data["TR_mri"],
         TR_array=TR_array,
         binary=True,
+        binarizing_method="median",
     )
 
     features = dFC_vecs

From c042b44d49d19e9e6ac4690f27d22af96e9f7dcf Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 29 May 2024 21:44:57 -0400
Subject: [PATCH 024/401] change simul tasks

---
 simul_dFC/task_data_simulator.py | 38 +++++---------------------------
 1 file changed, 6 insertions(+), 32 deletions(-)

diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index 7823932..7fe64c3 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -45,24 +45,11 @@
 print(f"subject-level simulation started running ... for subject: {subj_id} ...")
 
 all_task_info = {
-    "task-midFreqMidRest": {
-        "task_name": "task-midFreqMidRest",
-        "onset_time": onset_time,
-        "task_duration": 12.0,
-        "task_block_duration": 30.0,
-        "sim_length": sim_length,
-        "BOLD_period": BOLD_period,
-        "TAVG_period": TAVG_period,
-        "global_conn_coupling_coef": global_conn_coupling_coef,
-        "D": D,
-        "conn_speed": conn_speed,
-        "dt": dt,
-    },
     "task-lowFreqLongRest": {
         "task_name": "task-lowFreqLongRest",
         "onset_time": onset_time,
-        "task_duration": 20.0,
-        "task_block_duration": 40.0,
+        "task_duration": 8.0,
+        "task_block_duration": 20.0,
         "sim_length": sim_length,
         "BOLD_period": BOLD_period,
         "TAVG_period": TAVG_period,
@@ -74,21 +61,8 @@
     "task-lowFreqShortRest": {
         "task_name": "task-lowFreqShortRest",
         "onset_time": onset_time,
-        "task_duration": 20.0,
-        "task_block_duration": 25.0,
-        "sim_length": sim_length,
-        "BOLD_period": BOLD_period,
-        "TAVG_period": TAVG_period,
-        "global_conn_coupling_coef": global_conn_coupling_coef,
-        "D": D,
-        "conn_speed": conn_speed,
-        "dt": dt,
-    },
-    "task-lowFreqShortTask": {
-        "task_name": "task-lowFreqShortTask",
-        "onset_time": onset_time,
-        "task_duration": 5.0,
-        "task_block_duration": 30.0,
+        "task_duration": 12.0,
+        "task_block_duration": 20.0,
         "sim_length": sim_length,
         "BOLD_period": BOLD_period,
         "TAVG_period": TAVG_period,
@@ -123,11 +97,11 @@
         "conn_speed": conn_speed,
         "dt": dt,
     },
-    "task-midFreqMidRestNoisy": {
+    "task-lowFreqShortRestNoisy": {
         "task_name": "task-midFreqMidRestNoisy",
         "onset_time": onset_time,
         "task_duration": 12.0,
-        "task_block_duration": 30.0,
+        "task_block_duration": 20.0,
         "sim_length": sim_length,
         "BOLD_period": BOLD_period,
         "TAVG_period": TAVG_period,

From 963fcfdd487639b62d7cd9293c770bebbc371bfa Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 30 May 2024 12:10:04 -0400
Subject: [PATCH 025/401] handle Rest in events

---
 pydfc/task_utils.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index fad46d9..a807da1 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -44,7 +44,15 @@ def events_time_to_labels(
     ), "Something went wrong with the events file! The trial_type column was not found!"
 
     if event_types is None:
-        event_types = ["rest"] + list(np.unique(events[1:, trial_type_idx]))
+        event_types = list(np.unique(events[1:, trial_type_idx]))
+        # if rest is already there, remove it
+        if "rest" in event_types:
+            warnings.warn("rest is already in the event types")
+            event_types.remove("rest")
+        if "Rest" in event_types:
+            warnings.warn("Rest is already in the event types")
+            event_types.remove("Rest")
+        event_types = ["rest"] + event_types
 
     Fs = float(1 / TR_mri) * oversampling
     num_time_task = int(num_time_mri * oversampling)
@@ -58,7 +66,7 @@ def events_time_to_labels(
             if ("rest" in events[i, trial_type_idx]) or (
                 "Rest" in events[i, trial_type_idx]
             ):
-                raise ValueError("trial types should not include 'rest'")
+                continue
             start_time = float(events[i, onset_idx])
             end_time = float(events[i, onset_idx]) + float(events[i, duration_idx])
             start_timepoint = int(np.rint(start_time * Fs))

From f75da420401cc389a0037f6aeb3057410a27c499 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 30 May 2024 13:25:07 -0400
Subject: [PATCH 026/401] add ses

---
 task_dFC/nifti_to_roi_signal.py | 78 ++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 26 deletions(-)

diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 507e714..5c19561 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -12,16 +12,26 @@
 
 ################################# FUNCTIONS #################################
 def run_roi_signal_extraction(
-    subj, task, main_root, fmriprep_root, bold_suffix, output_root
+    subj,
+    task,
+    main_root,
+    fmriprep_root,
+    bold_suffix,
+    output_root,
+    session="",
 ):
     """
     Extract ROI signals and task labels for a given subject and task
+    and optionally session.
     """
     # find the func file for this subject and task
     try:
-        ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/")
+        if session == "":
+            ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/")
+        else:
+            ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/{session}/func/")
     except FileNotFoundError:
-        print(f"Subject {subj} not found in {fmriprep_root}")
+        print(f"Subject {subj} {session} not found in {fmriprep_root}")
         return
 
     ALL_TASK_FILES = [
@@ -32,7 +42,7 @@ def run_roi_signal_extraction(
 
     if not len(ALL_TASK_FILES) >= 1:
         # if the func file is not found, exclude the subject
-        print(f"Func file not found for {subj} {task}")
+        print(f"Func file not found for {subj} {session} {task}")
         return
 
     # there might be multiple runs for the same task
@@ -58,11 +68,13 @@ def run_roi_signal_extraction(
 
     for run in RUNS:
         task_file = [file_i for file_i in ALL_TASK_FILES if run in file_i][0]
-        nifti_file = f"{fmriprep_root}/{subj}/func/{task_file}"
-        info_file = (
-            f"{main_root}/bids/{subj}/func/{task_file.replace(bold_suffix, '_bold.json')}"
-        )
-
+        if session == "":
+            nifti_file = f"{fmriprep_root}/{subj}/func/{task_file}"
+            task_events_root = f"{main_root}/bids/{subj}/func"
+        else:
+            nifti_file = f"{fmriprep_root}/{subj}/{session}/func/{task_file}"
+            task_events_root = f"{main_root}/bids/{subj}/{session}/func"
+        info_file = f"{task_events_root}/{task_file.replace(bold_suffix, '_bold.json')}"
         ################################# LOAD JSON INFO #########################
         # Opening JSON file as a dictionary
         f = open(info_file)
@@ -91,7 +103,6 @@ def run_roi_signal_extraction(
             task_labels = np.zeros((int(num_time_mri * oversampling), 1))
             Fs_task = float(1 / TR_mri) * oversampling
         else:
-            task_events_root = f"{main_root}/bids/{subj}/func"
             ALL_EVENTS_FILES = os.listdir(task_events_root)
             ALL_EVENTS_FILES = [
                 file_i
@@ -99,11 +110,12 @@ def run_roi_signal_extraction(
                 if (subj in file_i)
                 and (task in file_i)
                 and (run in file_i)
+                and (session in file_i)
                 and ("events.tsv" in file_i)
             ]
             if not len(ALL_EVENTS_FILES) == 1:
                 # if the events file is not found, exclude the subject
-                print(f"Events file not found for {subj} {task} {run}")
+                print(f"Events file not found for {subj} {session} {task} {run}")
                 return
             # load the tsv events file
             events_file = f"{task_events_root}/{ALL_EVENTS_FILES[0]}"
@@ -134,14 +146,23 @@ def run_roi_signal_extraction(
             "TR_mri": TR_mri,
             "num_time_mri": num_time_mri,
         }
+
+        if session == "":
+            subj_session_prefix = f"{subj}"
+            output_dir = f"{output_root}/{subj}"
+        else:
+            subj_session_prefix = f"{subj}_{session}"
+            output_dir = f"{output_root}/{subj}/{session}"
+
         if multi_run_flag:
-            output_file_prefix = f"{subj}_{task}_{run}"
+            output_file_prefix = f"{subj_session_prefix}_{task}_{run}"
         else:
-            output_file_prefix = f"{subj}_{task}"
-        if not os.path.exists(f"{output_root}/{subj}/"):
-            os.makedirs(f"{output_root}/{subj}/")
-        np.save(f"{output_root}/{subj}/{output_file_prefix}_time-series.npy", time_series)
-        np.save(f"{output_root}/{subj}/{output_file_prefix}_task-data.npy", task_data)
+            output_file_prefix = f"{subj_session_prefix}_{task}"
+
+        if not os.path.exists(f"{output_dir}/"):
+            os.makedirs(f"{output_dir}/")
+        np.save(f"{output_dir}/{output_file_prefix}_time-series.npy", time_series)
+        np.save(f"{output_dir}/{output_file_prefix}_task-data.npy", task_data)
 
 
 ########################################################################################
@@ -171,6 +192,9 @@ def run_roi_signal_extraction(
     )
 
     TASKS = dataset_info["TASKS"]
+    SESSIONS = dataset_info["SESSIONS"]
+    if SESSIONS is None:
+        SESSIONS = [""]
 
     if "{dataset}" in dataset_info["main_root"]:
         main_root = dataset_info["main_root"].replace(
@@ -189,15 +213,17 @@ def run_roi_signal_extraction(
     else:
         output_root = dataset_info["roi_root"]
 
-    for task in TASKS:
-        run_roi_signal_extraction(
-            subj=participant_id,
-            task=task,
-            main_root=main_root,
-            fmriprep_root=fmriprep_root,
-            bold_suffix=dataset_info["bold_suffix"],
-            output_root=output_root,
-        )
+    for session in SESSIONS:
+        for task in TASKS:
+            run_roi_signal_extraction(
+                subj=participant_id,
+                task=task,
+                main_root=main_root,
+                fmriprep_root=fmriprep_root,
+                bold_suffix=dataset_info["bold_suffix"],
+                output_root=output_root,
+                session=session,
+            )
 
     print(
         f"subject-level ROI signal extraction CODE finished running ... for subject: {participant_id} ..."

From 8d13528853b5612775a5869fe46c59a0cd393c96 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 30 May 2024 18:09:04 -0400
Subject: [PATCH 027/401] change FCS estimate

---
 pydfc/data_loader.py            |  68 +++++-----
 task_dFC/FCS_estimate.py        | 229 ++++++++++++++++++--------------
 task_dFC/dFC_assessment.py      |   2 +-
 task_dFC/nifti_to_roi_signal.py |   2 +-
 4 files changed, 162 insertions(+), 139 deletions(-)

diff --git a/pydfc/data_loader.py b/pydfc/data_loader.py
index e14b2fc..fba1ced 100644
--- a/pydfc/data_loader.py
+++ b/pydfc/data_loader.py
@@ -322,25 +322,18 @@ def multi_nifti2timeseries(
 def load_TS(
     data_root,
     file_name,
-    SESSIONs,
     subj_id2load=None,
     task=None,
+    session=None,
     run=None,
 ):
     """
     load a TIME_SERIES object from a .npy file
-    if SESSIONs is a list, it will load all the sessions,
-        if it is a string, it will load that session
     if subj_id2load is None, it will load all the subjects
     file_name: name of the file to load
-        format example: {subj_id}_{task}_{run}_time-series.npy
+        format example: {subj_id}_{session}_{task}_{run}_time-series.npy
         (keep the {} for the variables)
     """
-    # check if SESSIONs is a list or a string
-    flag = False
-    if type(SESSIONs) is str:
-        SESSIONs = [SESSIONs]
-        flag = True
 
     if subj_id2load is None:
         SUBJECTS = find_subj_list(data_root)
@@ -348,37 +341,42 @@ def load_TS(
         assert "sub-" in subj_id2load, "subj_id2load must start with 'sub-'"
         SUBJECTS = [subj_id2load]
 
-    TS = {}
-    for session in SESSIONs:
-        TS[session] = None
-        for subj in SUBJECTS:
-            subj_fldr = subj
-            # make the file_name
-            TS_file = deepcopy(file_name)
-            if "{subj_id}" in file_name:
-                TS_file = TS_file.replace("{subj_id}", subj)
-            if "{task}" in file_name:
-                assert task is not None, "task must be provided"
-                TS_file = TS_file.replace("{task}", task)
-            if "{run}" in file_name:
-                assert run is not None, "run must be provided"
-                TS_file = TS_file.replace("{run}", run)
-
-            try:
+    TS = None
+    for subj in SUBJECTS:
+        subj_fldr = subj
+        # make the file_name
+        TS_file = deepcopy(file_name)
+        if "{subj_id}" in file_name:
+            TS_file = TS_file.replace("{subj_id}", subj)
+        if "{task}" in file_name:
+            assert task is not None, "task must be provided"
+            TS_file = TS_file.replace("{task}", task)
+        if "{session}" in file_name:
+            assert session is not None, "session must be provided"
+            TS_file = TS_file.replace("{session}", session)
+        if "{run}" in file_name:
+            assert run is not None, "run must be provided"
+            TS_file = TS_file.replace("{run}", run)
+
+        try:
+            if session is None:
                 time_series = np.load(
                     f"{data_root}/{subj_fldr}/{TS_file}", allow_pickle="True"
                 ).item()
-            except FileNotFoundError:
-                print(f"File {TS_file} not found for {subj}")
-                continue
-
-            if TS[session] is None:
-                TS[session] = time_series
             else:
-                TS[session].concat_ts(time_series)
+                time_series = np.load(
+                    f"{data_root}/{subj_fldr}/{session}/{TS_file}",
+                    allow_pickle="True",
+                ).item()
+        except FileNotFoundError:
+            print(f"File {TS_file} not found for {subj}")
+            continue
+
+        if TS is None:
+            TS = time_series
+        else:
+            TS.concat_ts(time_series)
 
-    if flag:
-        return TS[SESSIONs[0]]
     return TS
 
 
diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index d171085..027177a 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -1,3 +1,5 @@
+import argparse
+import json
 import os
 import time
 import warnings
@@ -12,121 +14,54 @@
 os.environ["NUMEXPR_NUM_THREADS"] = "16"
 os.environ["OMP_NUM_THREADS"] = "16"
 
-################################# Parameters #################################
-# data paths
-dataset = "ds003242"
-# main_root = f"../../DATA/{dataset}" # for local
-main_root = f"/data/origami/dFC/DATA/task-based/openneuro/{dataset}"  # for server
-roi_root = f"{main_root}/derivatives/ROI_timeseries"
-output_root = f"{main_root}/derivatives/fitted_MEASURES"
-
-# for consistency we use 0 for resting state
-TASKS = ["task-CIC", "task-midloc"]
-
-# default RUNS = None
-RUNS = None
-RUNS = {
-    "task-CIC": ["run-001", "run-002", "run-003", "run-004", "run-005", "run-006"],
-    "task-midloc": ["run-001"],
-}
-
-job_id = int(os.getenv("SGE_TASK_ID"))
-TASK_id = job_id - 1  # SGE_TASK_ID starts from 1 not 0
-if TASK_id >= len(TASKS):
-    print("TASK_id out of TASKS")
-    exit()
-task = TASKS[TASK_id]
-
-###### MEASUREMENT PARAMETERS ######
-
-# W is in sec
-
-params_methods = {
-    # Sliding Parameters
-    "W": 12,
-    "n_overlap": 1.0,
-    "sw_method": "pear_corr",
-    "tapered_window": True,
-    # TIME_FREQ
-    "TF_method": "WTC",
-    # CLUSTERING AND DHMM
-    "clstr_base_measure": "SlidingWindow",
-    # HMM
-    "hmm_iter": 20,
-    "dhmm_obs_state_ratio": 16 / 24,
-    # State Parameters
-    "n_states": 5,
-    "n_subj_clstrs": 10,
-    # Parallelization Parameters
-    "n_jobs": 2,
-    "verbose": 0,
-    "backend": "loky",
-    # SESSION
-    "session": task,
-    # Hyper Parameters
-    "normalization": True,
-    "num_subj": None,
-    "num_time_point": None,
-}
-
-###### HYPER PARAMETERS ALTERNATIVE ######
-
-MEASURES_name_lst = [
-    "SlidingWindow",
-    "Time-Freq",
-    "CAP",
-    "ContinuousHMM",
-    "Windowless",
-    "Clustering",
-    "DiscreteHMM",
-]
-
-alter_hparams = {
-    # 'session': ['Rest1_RL', 'Rest2_LR', 'Rest2_RL'],
-    # 'n_overlap': [0, 0.25, 0.75, 1],
-    # 'n_states': [6, 16],
-    # # 'normalization': [],
-    # 'num_subj': [50, 100, 200],
-    # 'num_select_nodes': [30, 50, 333],
-    # 'num_time_point': [800, 1000],
-    # 'Fs_ratio': [0.50, 0.75, 1.5],
-    # 'noise_ratio': [1.00, 2.00, 3.00],
-    # 'num_realization': []
-}
-
-###### MultiAnalysis PARAMETERS ######
-
-params_multi_analysis = {
-    # Parallelization Parameters
-    "n_jobs": None,
-    "verbose": 0,
-    "backend": "loky",
-}
-
-if RUNS is None:
-    RUNS = {task: [None]}
-for run in RUNS[task]:
+########################################################################################
+
+
+def run_FCS_estimate(
+    params_methods,
+    MEASURES_name_lst,
+    alter_hparams,
+    params_multi_analysis,
+    task,
+    roi_root,
+    output_root,
+    session=None,
+    run=None,
+):
+    if session is None:
+        output_dir = f"{output_root}"
+    else:
+        output_dir = f"{output_root}/{session}"
+
     if run is None:
         print(f"TASK: {task} started ...")
-        file_suffix = f"{task}"
-        BOLD_file_name = "{subj_id}_{task}_time-series.npy"
+        if session is None:
+            BOLD_file_name = "{subj_id}_{task}_time-series.npy"
+            file_suffix = f"{task}"
+        else:
+            BOLD_file_name = "{subj_id}_{session}_{task}_time-series.npy"
+            file_suffix = f"{session}_{task}"
     else:
         print(f"TASK: {task}, RUN: {run} started ...")
-        file_suffix = f"{task}_{run}"
-        BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy"
+        if session is None:
+            BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy"
+            file_suffix = f"{task}_{run}"
+        else:
+            BOLD_file_name = "{subj_id}_{session}_{task}_{run}_time-series.npy"
+            file_suffix = f"{session}_{task}_{run}"
     ################################# LOAD DATA #################################
     BOLD = data_loader.load_TS(
         data_root=roi_root,
         file_name=BOLD_file_name,
-        SESSIONs=task,
         subj_id2load=None,
         task=task,
+        session=session,
         run=run,
     )
     ################################ Measures of dFC #################################
 
     MA = MultiAnalysis(
-        analysis_name=f"task-based-dFC-{dataset}-{file_suffix}", **params_multi_analysis
+        analysis_name=f"task-based-dFC-{file_suffix}", **params_multi_analysis
     )
 
     MEASURES_lst = MA.measures_initializer(
@@ -151,9 +86,99 @@
         # Save
         if not os.path.exists(f"{output_root}"):
             os.makedirs(f"{output_root}")
-        np.save(f"{output_root}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure)
+        np.save(f"{output_dir}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure)
 
     print(f"Measurement required {time.time() - tic:0.3f} seconds.")
-    np.save(f"{output_root}/multi-analysis_{file_suffix}.npy", MA)
+    np.save(f"{output_dir}/multi-analysis_{file_suffix}.npy", MA)
+
+
+########################################################################################
+
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to fit dFC methods for a given task.
+    """
+
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
+    parser.add_argument("--methods_config", type=str, help="methods config file")
+
+    args = parser.parse_args()
+
+    dataset_info_file = args.dataset_info
+    methods_config_file = args.methods_config
+
+    # Read dataset info
+    with open(dataset_info_file, "r") as f:
+        dataset_info = json.load(f)
+
+    # Read methods config
+    with open(methods_config_file, "r") as f:
+        methods_config = json.load(f)
 
+    TASKS = dataset_info["TASKS"]
+
+    job_id = int(os.getenv("SGE_TASK_ID"))
+    TASK_id = job_id - 1  # SGE_TASK_ID starts from 1 not 0
+    if TASK_id >= len(TASKS):
+        print("TASK_id out of TASKS")
+        exit()
+    task = TASKS[TASK_id]
+
+    print(f"FCS estimation CODE started running ... for task: {task} ...")
+
+    SESSIONS = dataset_info["SESSIONS"]
+    if SESSIONS is None:
+        SESSIONS = [None]
+    RUNS = dataset_info["RUNS"]
+    if RUNS is None:
+        RUNS = {task: [None]}
+
+    if "{dataset}" in dataset_info["main_root"]:
+        main_root = dataset_info["main_root"].replace(
+            "{dataset}", dataset_info["dataset"]
+        )
+    else:
+        main_root = dataset_info["main_root"]
+
+    if "{main_root}" in dataset_info["fmriprep_root"]:
+        fmriprep_root = dataset_info["fmriprep_root"].replace("{main_root}", main_root)
+    else:
+        fmriprep_root = dataset_info["fmriprep_root"]
+
+    if "{main_root}" in dataset_info["roi_root"]:
+        roi_root = dataset_info["roi_root"].replace("{main_root}", main_root)
+    else:
+        roi_root = dataset_info["roi_root"]
+
+    if "{main_root}" in dataset_info["fitted_measures_root"]:
+        fitted_measures_root = dataset_info["fitted_measures_root"].replace(
+            "{main_root}", main_root
+        )
+    else:
+        fitted_measures_root = dataset_info["fitted_measures_root"]
+
+    # methods params
+    params_methods = methods_config["params_methods"]
+    MEASURES_name_lst = methods_config["MEASURES_name_lst"]
+    alter_hparams = methods_config["alter_hparams"]
+    params_multi_analysis = methods_config["params_multi_analysis"]
+
+    for session in SESSIONS:
+        for run in RUNS[task]:
+            run_FCS_estimate(
+                params_methods=params_methods,
+                MEASURES_name_lst=MEASURES_name_lst,
+                alter_hparams=alter_hparams,
+                params_multi_analysis=params_multi_analysis,
+                task=task,
+                roi_root=roi_root,
+                output_root=fitted_measures_root,
+                session=session,
+                run=run,
+            )
+
+    print(f"FCS estimation CODE finished running ... for task: {task} ...")
 #################################################################################
diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py
index b3068b0..84c77a1 100644
--- a/task_dFC/dFC_assessment.py
+++ b/task_dFC/dFC_assessment.py
@@ -158,7 +158,7 @@ def run_dFC_assess(
     dataset_info_file = args.dataset_info
     participant_id = args.participant_id
 
-    # Read global configs
+    # Read dataset info
     with open(dataset_info_file, "r") as f:
         dataset_info = json.load(f)
 
diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 5c19561..fc54d9c 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -183,7 +183,7 @@ def run_roi_signal_extraction(
     dataset_info_file = args.dataset_info
     participant_id = args.participant_id
 
-    # Read global configs
+    # Read dataset info
     with open(dataset_info_file, "r") as f:
         dataset_info = json.load(f)
 

From 41e78b67c524a12488b69625b8bc1c611c1ebd63 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 30 May 2024 18:36:37 -0400
Subject: [PATCH 028/401] FCS_estimate: create output_dir (not output_root) before saving

---
 task_dFC/FCS_estimate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index 027177a..67f2591 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -84,8 +84,8 @@ def run_FCS_estimate(
         print("FCS estimation done.")
 
         # Save
-        if not os.path.exists(f"{output_root}"):
-            os.makedirs(f"{output_root}")
+        if not os.path.exists(f"{output_dir}"):
+            os.makedirs(f"{output_dir}")
         np.save(f"{output_dir}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure)
 
     print(f"Measurement required {time.time() - tic:0.3f} seconds.")

From a9107efbcc8e37d35138d8b054bfc11545c00f85 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 31 May 2024 12:05:23 -0400
Subject: [PATCH 029/401] event_types fix

---
 pydfc/simul_utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py
index d716498..b4e155f 100644
--- a/pydfc/simul_utils.py
+++ b/pydfc/simul_utils.py
@@ -184,7 +184,6 @@ def create_simul_task_info(
     """
     ####################### EXTRACT TASK LABELS #######################
     events = []
-    event_types = ["rest", "task"]
 
     # using onset, task_duration, task_block_duration to create the events
     events.append(["onset", "duration", "trial_type"])
@@ -194,11 +193,10 @@ def create_simul_task_info(
         t += task_block_duration
     events = np.array(events)
 
-    event_labels, Fs_task = task_utils.events_time_to_labels(
+    event_labels, Fs_task, event_types = task_utils.events_time_to_labels(
         events=events,
         TR_mri=TR_mri,
         num_time_mri=num_time_mri,
-        event_types=event_types,
         oversampling=oversampling,
         return_0_1=False,
     )

From c16b59d25dec901eb8ca90979a5c0fe5bf5dd03b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 31 May 2024 12:21:08 -0400
Subject: [PATCH 030/401] task_data_simulator: use absolute server path for main_root

---
 simul_dFC/task_data_simulator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index 7fe64c3..bf8709e 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -22,7 +22,7 @@
 # data paths
 dataset = "ds000001"
 # main_root = f"./DATA/{dataset}" # for local
-main_root = f"../../DATA/task-based/simulated/{dataset}"  # for server
+main_root = f"/data/origami/dFC/DATA/task-based/simulated/{dataset}"  # for server
 output_root = f"{main_root}/derivatives/ROI_timeseries"
 
 # simulation parameters

From 639a2d5c071ff1e5bf75262b00b5687f80a10ae4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 31 May 2024 12:24:57 -0400
Subject: [PATCH 031/401] task_data_simulator: switch dataset to ds000002

---
 simul_dFC/task_data_simulator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index bf8709e..24aa92a 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -20,7 +20,7 @@
 ################################# Parameters ####################################
 
 # data paths
-dataset = "ds000001"
+dataset = "ds000002"
 # main_root = f"./DATA/{dataset}" # for local
 main_root = f"/data/origami/dFC/DATA/task-based/simulated/{dataset}"  # for server
 output_root = f"{main_root}/derivatives/ROI_timeseries"

From 6c4c8851914bebe7dcbcb60f104f2ca3f986fbca Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 3 Jun 2024 13:00:55 -0400
Subject: [PATCH 032/401] FCS_estimate: treat missing SESSIONS/RUNS keys as None

---
 task_dFC/FCS_estimate.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index 67f2591..42bb99f 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -129,10 +129,17 @@ def run_FCS_estimate(
 
     print(f"FCS estimation CODE started running ... for task: {task} ...")
 
-    SESSIONS = dataset_info["SESSIONS"]
+    if "SESSIONS" in dataset_info:
+        SESSIONS = dataset_info["SESSIONS"]
+    else:
+        SESSIONS = None
     if SESSIONS is None:
         SESSIONS = [None]
-    RUNS = dataset_info["RUNS"]
+
+    if "RUNS" in dataset_info:
+        RUNS = dataset_info["RUNS"]
+    else:
+        RUNS = None
     if RUNS is None:
         RUNS = {task: [None]}
 

From 5cae1c9344fff477546d3c00691872e6c9f4e3f9 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 3 Jun 2024 16:43:12 -0400
Subject: [PATCH 033/401] FCS_estimate: drop unused fmriprep_root lookup

---
 task_dFC/FCS_estimate.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index 42bb99f..064988c 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -150,11 +150,6 @@ def run_FCS_estimate(
     else:
         main_root = dataset_info["main_root"]
 
-    if "{main_root}" in dataset_info["fmriprep_root"]:
-        fmriprep_root = dataset_info["fmriprep_root"].replace("{main_root}", main_root)
-    else:
-        fmriprep_root = dataset_info["fmriprep_root"]
-
     if "{main_root}" in dataset_info["roi_root"]:
         roi_root = dataset_info["roi_root"].replace("{main_root}", main_root)
     else:

From 725493e11146f68d267eb30558ac31f1c7ee2dcf Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 3 Jun 2024 18:37:01 -0400
Subject: [PATCH 034/401] add session to dFC assess

---
 task_dFC/dFC_assessment.py | 214 +++++++++++++++++++------------------
 1 file changed, 112 insertions(+), 102 deletions(-)

diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py
index 84c77a1..3b858a5 100644
--- a/task_dFC/dFC_assessment.py
+++ b/task_dFC/dFC_assessment.py
@@ -23,121 +23,113 @@ def run_dFC_assess(
     roi_root,
     fitted_measures_root,
     output_root,
+    session=None,
+    run=None,
 ):
+    if session is None:
+        output_dir = f"{output_root}/{subj_id}"
+        fitted_measures_dir = f"{fitted_measures_root}"
+    else:
+        output_dir = f"{output_root}/{subj_id}/{session}"
+        fitted_measures_dir = f"{fitted_measures_root}/{session}"
+
+    if run is None:
+        if session is None:
+            print(f"Subject-level dFC assessment started for TASK: {task} ...")
+            input_root = f"{roi_root}/{subj_id}"
+            BOLD_file_name = "{subj_id}_{task}_time-series.npy"
+            file_suffix = f"{task}"
+        else:
+            print(
+                f"Subject-level dFC assessment started for Session {session}, TASK: {task} ..."
+            )
+            input_root = f"{roi_root}/{subj_id}/{session}"
+            BOLD_file_name = "{subj_id}_{session}_{task}_time-series.npy"
+            file_suffix = f"{session}_{task}"
+    else:
+        if session is None:
+            print(
+                f"Subject-level dFC assessment started for TASK: {task}, RUN: {run} ..."
+            )
+            input_root = f"{roi_root}/{subj_id}"
+            BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy"
+            file_suffix = f"{task}_{run}"
+        else:
+            print(
+                f"Subject-level dFC assessment started for Session {session}, TASK: {task}, RUN: {run} ..."
+            )
+            input_root = f"{roi_root}/{subj_id}/{session}"
+            BOLD_file_name = "{subj_id}_{session}_{task}_{run}_time-series.npy"
+            file_suffix = f"{session}_{task}_{run}"
 
     # check if the subject has this task in roi_root
-    if not os.path.exists(f"{roi_root}/{subj_id}"):
-        print(f"Subject {subj_id} not found in {roi_root}")
+    if not os.path.exists(input_root):
+        print(f"{input_root} not found in {roi_root}")
         return
 
-    ALL_ROI_FILES = os.listdir(f"{roi_root}/{subj_id}/")
+    ALL_ROI_FILES = os.listdir(f"{input_root}/")
     ALL_ROI_FILES = [
         roi_file
         for roi_file in ALL_ROI_FILES
         if ("_time-series.npy" in roi_file) and (task in roi_file)
     ]
+    if session is not None:
+        ALL_ROI_FILES = [roi_file for roi_file in ALL_ROI_FILES if (session in roi_file)]
+    if run is not None:
+        ALL_ROI_FILES = [roi_file for roi_file in ALL_ROI_FILES if (run in roi_file)]
     ALL_ROI_FILES.sort()
 
     # if there are no files for this task, return
     if not len(ALL_ROI_FILES) >= 1:
-        print(f"No time series files found for {subj_id} {task}")
+        print(f"No time series files found for {subj_id} {file_suffix}")
         return
+    ################################# LOAD FIT MEASURES #################################
+
+    MA = np.load(
+        f"{fitted_measures_dir}/multi-analysis_{file_suffix}.npy",
+        allow_pickle="TRUE",
+    ).item()
+
+    ALL_RECORDS = os.listdir(f"{fitted_measures_dir}/")
+    ALL_RECORDS = [i for i in ALL_RECORDS if ("MEASURE" in i) and (file_suffix in i)]
+    ALL_RECORDS.sort()
+    MEASURES_fit_lst = list()
+    for s in ALL_RECORDS:
+        fit_measure = np.load(f"{fitted_measures_dir}/{s}", allow_pickle="TRUE").item()
+        MEASURES_fit_lst.append(fit_measure)
+    MA.set_MEASURES_fit_lst(MEASURES_fit_lst)
+    print("fitted MEASURES are loaded ...")
+
+    ################################# LOAD DATA #################################
+
+    BOLD = data_loader.load_TS(
+        data_root=roi_root,
+        file_name=BOLD_file_name,
+        subj_id2load=subj_id,
+        task=task,
+        session=session,
+        run=run,
+    )
 
-    # check if "_run" exists in all the task file names
-    if all(["_run" in roi_file for roi_file in ALL_ROI_FILES]):
-        # find all the runs
-        RUNS = [
-            roi_file[
-                roi_file.find("_run")
-                + 1 : roi_file.find("_run")
-                + 1
-                + roi_file[roi_file.find("_run") + 1 :].find("_")
-            ]
-            for roi_file in ALL_ROI_FILES
-        ]
-        # sort
-        RUNS.sort()
-        print(f"Found multiple runs for {subj_id} {task}: {RUNS}")
-    else:
-        RUNS = [None]
-
-    for run in RUNS:
-
-        # check if the subject has this task and run in roi_root
-        if run is None:
-            file_suffix = f"{task}"
-            if not os.path.exists(
-                f"{roi_root}/{subj_id}/{subj_id}_{file_suffix}_time-series.npy"
-            ):
-                print(f"Time series file not found for {subj_id} {task}")
-                continue
-            else:
-                print(
-                    f"subject-level dFC assessment CODE started running ... for task {task} of subject {subj_id} ..."
-                )
-                BOLD_file_name = "{subj_id}_{task}_time-series.npy"
-        else:
-            file_suffix = f"{task}_{run}"
-            if not os.path.exists(
-                f"{roi_root}/{subj_id}/{subj_id}_{file_suffix}_time-series.npy"
-            ):
-                print(f"Time series file not found for {subj_id} {task} {run}")
-                continue
-            else:
-                print(
-                    f"subject-level dFC assessment CODE started running ... for task {task} and {run} of subject {subj_id} ..."
-                )
-                BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy"
-
-        ################################# LOAD FIT MEASURES #################################
-
-        MA = np.load(
-            f"{fitted_measures_root}/multi-analysis_{file_suffix}.npy",
-            allow_pickle="TRUE",
-        ).item()
-
-        ALL_RECORDS = os.listdir(f"{fitted_measures_root}/")
-        ALL_RECORDS = [i for i in ALL_RECORDS if ("MEASURE" in i) and (file_suffix in i)]
-        ALL_RECORDS.sort()
-        MEASURES_fit_lst = list()
-        for s in ALL_RECORDS:
-            fit_measure = np.load(
-                f"{fitted_measures_root}/{s}", allow_pickle="TRUE"
-            ).item()
-            MEASURES_fit_lst.append(fit_measure)
-        MA.set_MEASURES_fit_lst(MEASURES_fit_lst)
-        print("fitted MEASURES are loaded ...")
-
-        ################################# LOAD DATA #################################
-
-        BOLD = data_loader.load_TS(
-            data_root=roi_root,
-            file_name=BOLD_file_name,
-            SESSIONs=task,
-            subj_id2load=subj_id,
-            task=task,
-            run=run,
-        )
-
-        ################################# dFC ASSESSMENT #################################
+    ################################# dFC ASSESSMENT #################################
 
-        tic = time.time()
-        print("Measurement Started ...")
+    tic = time.time()
+    print("Measurement Started ...")
 
-        print("dFC estimation started...")
-        dFC_dict = MA.subj_lvl_dFC_assess(time_series=BOLD)
-        print("dFC estimation done.")
+    print("dFC estimation started...")
+    dFC_dict = MA.subj_lvl_dFC_assess(time_series=BOLD)
+    print("dFC estimation done.")
 
-        print(f"Measurement required {time.time() - tic:0.3f} seconds.")
+    print(f"Measurement required {time.time() - tic:0.3f} seconds.")
 
-        ################################# SAVE DATA #################################
+    ################################# SAVE DATA #################################
 
-        folder = f"{output_root}/{subj_id}"
-        if not os.path.exists(folder):
-            os.makedirs(folder)
+    folder = f"{output_dir}/"
+    if not os.path.exists(folder):
+        os.makedirs(folder)
 
-        for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]):
-            np.save(f"{folder}/dFC_{file_suffix}_{dFC_id}.npy", dFC)
+    for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]):
+        np.save(f"{folder}dFC_{file_suffix}_{dFC_id}.npy", dFC)
 
 
 #######################################################################################
@@ -192,17 +184,35 @@ def run_dFC_assess(
     else:
         output_root = dataset_info["dFC_root"]
 
-    for task in TASKS:
-        run_dFC_assess(
-            subj_id=participant_id,
-            task=task,
-            roi_root=roi_root,
-            fitted_measures_root=fitted_measures_root,
-            output_root=output_root,
-        )
+    if "SESSIONS" in dataset_info:
+        SESSIONS = dataset_info["SESSIONS"]
+    else:
+        SESSIONS = None
+    if SESSIONS is None:
+        SESSIONS = [None]
+
+    if "RUNS" in dataset_info:
+        RUNS = dataset_info["RUNS"]
+    else:
+        RUNS = None
+    if RUNS is None:
+        RUNS = {task: [None] for task in TASKS}
+
+    for session in SESSIONS:
+        for task in TASKS:
+            for run in RUNS[task]:
+                run_dFC_assess(
+                    subj_id=participant_id,
+                    task=task,
+                    roi_root=roi_root,
+                    fitted_measures_root=fitted_measures_root,
+                    output_root=output_root,
+                    session=session,
+                    run=run,
+                )
 
     print(
-        f"subject-level dFC assessment CODE finished running ... for subject: {participant_id} ..."
+        f"subject-level dFC assessment CODE finished running for subject: {participant_id}"
     )
 
 #######################################################################################

From 9c412ec74299f7db350ad770e09e16335eed2be7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 7 Jun 2024 13:36:30 -0400
Subject: [PATCH 035/401] add session to ML

---
 task_dFC/ML.py | 383 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 238 insertions(+), 145 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index d70b4f9..15c575d 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -16,7 +16,7 @@
 #######################################################################################
 
 
-def find_available_subjects(dFC_root, task, run=None, dFC_id=None):
+def find_available_subjects(dFC_root, task, run=None, session=None, dFC_id=None):
     """
     Find the subjects that have dFC results for the given task and dFC_id (method).
     """
@@ -25,7 +25,10 @@ def find_available_subjects(dFC_root, task, run=None, dFC_id=None):
     ALL_SUBJ_FOLDERS = [folder for folder in ALL_SUBJ_FOLDERS if "sub-" in folder]
     ALL_SUBJ_FOLDERS.sort()
     for subj_folder in ALL_SUBJ_FOLDERS:
-        ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/")
+        if session is None:
+            ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/")
+        else:
+            ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/{session}/")
         ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if task in dFC_file]
         if dFC_id is not None:
             ALL_DFC_FILES = [
@@ -33,88 +36,93 @@ def find_available_subjects(dFC_root, task, run=None, dFC_id=None):
             ]
         if run is not None:
             ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if run in dFC_file]
+        if session is not None:
+            ALL_DFC_FILES = [
+                dFC_file for dFC_file in ALL_DFC_FILES if session in dFC_file
+            ]
         ALL_DFC_FILES.sort()
         if len(ALL_DFC_FILES) > 0:
             SUBJECTS.append(subj_folder)
     return SUBJECTS
 
 
-def extract_task_features(TASKS, RUNS, roi_root, output_root):
+def extract_task_features(TASKS, RUNS, SESSIONS, roi_root, output_root):
     """
     Extract task features from the event data."""
-    task_features = {
-        "task": list(),
-        "run": list(),
-        "relative_task_on": list(),
-        "avg_task_duration": list(),
-        "var_task_duration": list(),
-        "avg_rest_duration": list(),
-        "var_rest_duration": list(),
-        "num_of_transitions": list(),
-        "relative_transition_freq": list(),
-    }
-    for task_id, task in enumerate(TASKS):
-
-        if task == "task-restingstate":
-            continue
-
-        if RUNS is None:
-            RUNS = {task: [None]}
-        for run in RUNS[task]:
-
-            SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task, run=run)
-
-            for subj in SUBJECTS:
-                # event data
-                if run is None:
-                    task_data = np.load(
-                        f"{roi_root}/{subj}/{subj}_{task}_task-data.npy",
-                        allow_pickle="TRUE",
-                    ).item()
-                else:
-                    task_data = np.load(
-                        f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy",
-                        allow_pickle="TRUE",
-                    ).item()
-                Fs_task = task_data["Fs_task"]
-                TR_task = 1 / Fs_task
-
-                task_presence = task_utils.extract_task_presence(
-                    event_labels=task_data["event_labels"],
-                    TR_task=TR_task,
-                    TR_mri=task_data["TR_mri"],
-                    binary=True,
-                    binarizing_method="median",
-                )
+    for session in SESSIONS:
+        task_features = {
+            "task": list(),
+            "run": list(),
+            "relative_task_on": list(),
+            "avg_task_duration": list(),
+            "var_task_duration": list(),
+            "avg_rest_duration": list(),
+            "var_rest_duration": list(),
+            "num_of_transitions": list(),
+            "relative_transition_freq": list(),
+        }
+        for task_id, task in enumerate(TASKS):
 
-                relative_task_on = task_utils.relative_task_on(task_presence)
-                # task duration
-                avg_task_duration, var_task_duration = task_utils.task_duration(
-                    task_presence, task_data["TR_mri"]
-                )
-                # rest duration
-                avg_rest_duration, var_rest_duration = task_utils.rest_duration(
-                    task_presence, task_data["TR_mri"]
-                )
-                # freq of transitions
-                num_of_transitions, relative_transition_freq = task_utils.transition_freq(
-                    task_presence
-                )
+            if task == "task-restingstate":
+                continue
 
-                task_features["task"].append(task)
-                task_features["run"].append(run)
-                task_features["relative_task_on"].append(relative_task_on)
-                task_features["avg_task_duration"].append(avg_task_duration)
-                task_features["var_task_duration"].append(var_task_duration)
-                task_features["avg_rest_duration"].append(avg_rest_duration)
-                task_features["var_rest_duration"].append(var_rest_duration)
-                task_features["num_of_transitions"].append(num_of_transitions)
-                task_features["relative_transition_freq"].append(relative_transition_freq)
+            if RUNS is None:
+                RUNS = {task: [None]}
+            for run in RUNS[task]:
 
-    folder = f"{output_root}"
-    if not os.path.exists(folder):
-        os.makedirs(folder)
-    np.save(f"{folder}/task_features.npy", task_features)
+                SUBJECTS = find_available_subjects(
+                    dFC_root=dFC_root, task=task, run=run, session=session
+                )
+
+                for subj in SUBJECTS:
+                    # event data
+                    task_data = load_task_data(
+                        roi_root=roi_root, subj=subj, task=task, run=run, session=session
+                    )
+                    Fs_task = task_data["Fs_task"]
+                    TR_task = 1 / Fs_task
+
+                    task_presence = task_utils.extract_task_presence(
+                        event_labels=task_data["event_labels"],
+                        TR_task=TR_task,
+                        TR_mri=task_data["TR_mri"],
+                        binary=True,
+                        binarizing_method="mean",
+                    )
+
+                    relative_task_on = task_utils.relative_task_on(task_presence)
+                    # task duration
+                    avg_task_duration, var_task_duration = task_utils.task_duration(
+                        task_presence, task_data["TR_mri"]
+                    )
+                    # rest duration
+                    avg_rest_duration, var_rest_duration = task_utils.rest_duration(
+                        task_presence, task_data["TR_mri"]
+                    )
+                    # freq of transitions
+                    num_of_transitions, relative_transition_freq = (
+                        task_utils.transition_freq(task_presence)
+                    )
+
+                    task_features["task"].append(task)
+                    task_features["run"].append(run)
+                    task_features["relative_task_on"].append(relative_task_on)
+                    task_features["avg_task_duration"].append(avg_task_duration)
+                    task_features["var_task_duration"].append(var_task_duration)
+                    task_features["avg_rest_duration"].append(avg_rest_duration)
+                    task_features["var_rest_duration"].append(var_rest_duration)
+                    task_features["num_of_transitions"].append(num_of_transitions)
+                    task_features["relative_transition_freq"].append(
+                        relative_transition_freq
+                    )
+
+        if session is None:
+            folder = f"{output_root}"
+        else:
+            folder = f"{output_root}/{session}"
+        if not os.path.exists(folder):
+            os.makedirs(folder)
+        np.save(f"{folder}/task_features.npy", task_features)
 
 
 def dFC_feature_extraction_subj_lvl(
@@ -142,7 +150,7 @@ def dFC_feature_extraction_subj_lvl(
         TR_mri=task_data["TR_mri"],
         TR_array=TR_array,
         binary=True,
-        binarizing_method="median",
+        binarizing_method="mean",
     )
 
     features = dFC_vecs
@@ -175,6 +183,63 @@ def dFC_feature_extraction_subj_lvl(
     return features, target
 
 
+def load_dFC(dFC_root, subj, task, dFC_id, run=None, session=None):
+    """
+    Load the dFC results for a given subject, task, dFC_id, run and session.
+
+    The file is read from ``{dFC_root}/{subj}[/{session}]`` and is named
+    ``dFC[_{session}]_{task}[_{run}]_{dFC_id}.npy``; the bracketed parts are
+    only present when the corresponding argument is not None.
+
+    Returns the object stored in that file.
+    """
+    if session is None:
+        folder = f"{dFC_root}/{subj}"
+        session_part = ""
+    else:
+        folder = f"{dFC_root}/{subj}/{session}"
+        session_part = f"{session}_"
+    run_part = "" if run is None else f"{run}_"
+    file_name = f"dFC_{session_part}{task}_{run_part}{dFC_id}.npy"
+
+    # .item() unwraps the single Python object stored in the loaded array.
+    # allow_pickle is passed as a real bool (a string such as "TRUE" is only
+    # accepted because it is truthy). Unpickling can execute arbitrary code,
+    # so only load files from a trusted source.
+    dFC = np.load(f"{folder}/{file_name}", allow_pickle=True).item()
+
+    return dFC
+
+
+def load_task_data(roi_root, subj, task, run=None, session=None):
+    """
+    Load the task data for a given subject, task, run and session.
+
+    The file is read from ``{roi_root}/{subj}[/{session}]`` and is named
+    ``{subj}[_{session}]_{task}[_{run}]_task-data.npy``; the bracketed parts
+    are only present when the corresponding argument is not None.
+
+    Returns the object stored in that file.
+    """
+    # a session adds both a sub-folder and a file-name component
+    if session is None:
+        folder = f"{roi_root}/{subj}"
+        prefix = f"{subj}"
+    else:
+        folder = f"{roi_root}/{subj}/{session}"
+        prefix = f"{subj}_{session}"
+    run_part = "" if run is None else f"_{run}"
+    file_name = f"{prefix}_{task}{run_part}_task-data.npy"
+
+    # .item() unwraps the single Python object stored in the loaded array.
+    # allow_pickle is passed as a real bool (a string such as "TRUE" is only
+    # accepted because it is truthy). Unpickling can execute arbitrary code,
+    # so only load files from a trusted source.
+    task_data = np.load(f"{folder}/{file_name}", allow_pickle=True).item()
+
+    return task_data
+
+
 def dFC_feature_extraction(
     task,
     train_subjects,
@@ -183,6 +248,7 @@ def dFC_feature_extraction(
     roi_root,
     dFC_root,
     run=None,
+    session=None,
     dynamic_pred="no",
     normalize_dFC=True,
 ):
@@ -191,25 +257,23 @@ def dFC_feature_extraction(
     for all subjects.
     if run is specified, dFC results for that run will be used.
     """
+    dFC_measure_name = None
     X_train = None
     y_train = None
     subj_label_train = list()
     for subj in train_subjects:
-        if run is None:
-            dFC = np.load(
-                f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
-            ).item()
-            task_data = np.load(
-                f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
-            ).item()
-        else:
-            dFC = np.load(
-                f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE"
-            ).item()
-            task_data = np.load(
-                f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy",
-                allow_pickle="TRUE",
-            ).item()
+
+        dFC = load_dFC(
+            dFC_root=dFC_root,
+            subj=subj,
+            task=task,
+            dFC_id=dFC_id,
+            run=run,
+            session=session,
+        )
+        task_data = load_task_data(
+            roi_root=roi_root, subj=subj, task=task, run=run, session=session
+        )
 
         X_subj, y_subj = dFC_feature_extraction_subj_lvl(
             dFC=dFC,
@@ -226,25 +290,28 @@ def dFC_feature_extraction(
             X_train = np.concatenate((X_train, X_subj), axis=0)
             y_train = np.concatenate((y_train, y_subj), axis=0)
 
+        if dFC_measure_name is None:
+            dFC_measure_name = dFC.measure.measure_name
+        else:
+            assert (
+                dFC_measure_name == dFC.measure.measure_name
+            ), "dFC measure is not consistent."
+
     X_test = None
     y_test = None
     subj_label_test = list()
     for subj in test_subjects:
-        if run is None:
-            dFC = np.load(
-                f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
-            ).item()
-            task_data = np.load(
-                f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
-            ).item()
-        else:
-            dFC = np.load(
-                f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE"
-            ).item()
-            task_data = np.load(
-                f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy",
-                allow_pickle="TRUE",
-            ).item()
+        dFC = load_dFC(
+            dFC_root=dFC_root,
+            subj=subj,
+            task=task,
+            dFC_id=dFC_id,
+            run=run,
+            session=session,
+        )
+        task_data = load_task_data(
+            roi_root=roi_root, subj=subj, task=task, run=run, session=session
+        )
 
         X_subj, y_subj = dFC_feature_extraction_subj_lvl(
             dFC=dFC,
@@ -261,6 +328,13 @@ def dFC_feature_extraction(
             X_test = np.concatenate((X_test, X_subj), axis=0)
             y_test = np.concatenate((y_test, y_subj), axis=0)
 
+        if dFC_measure_name is None:
+            dFC_measure_name = dFC.measure.measure_name
+        else:
+            assert (
+                dFC_measure_name == dFC.measure.measure_name
+            ), "dFC measure is not consistent."
+
     print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
     subj_label_train = np.array(subj_label_train)
     subj_label_test = np.array(subj_label_test)
@@ -272,7 +346,7 @@ def dFC_feature_extraction(
         y_test,
         subj_label_train,
         subj_label_test,
-        dFC.measure.measure_name,
+        dFC_measure_name,
     )
 
 
@@ -282,6 +356,7 @@ def task_presence_classification(
     roi_root,
     dFC_root,
     run=None,
+    session=None,
     dynamic_pred="no",
     normalize_dFC=True,
     train_test_ratio=0.8,
@@ -299,7 +374,7 @@ def task_presence_classification(
         return
 
     SUBJECTS = find_available_subjects(
-        dFC_root=dFC_root, task=task, run=run, dFC_id=dFC_id
+        dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id
     )
 
     # randomly select train_test_ratio of the subjects for training
@@ -321,6 +396,7 @@ def task_presence_classification(
             roi_root=roi_root,
             dFC_root=dFC_root,
             run=run,
+            session=session,
             dynamic_pred=dynamic_pred,
             normalize_dFC=normalize_dFC,
         )
@@ -406,51 +482,59 @@ def task_presence_classification(
 def run_classification(
     TASKS,
     RUNS,
+    SESSIONS,
     roi_root,
     dFC_root,
     output_root,
     dynamic_pred="no",
     normalize_dFC=True,
 ):
-    ML_scores = {
-        "subj_id": list(),
-        "group": list(),
-        "task": list(),
-        "run": list(),
-        "dFC method": list(),
-        "KNN accuracy": list(),
-    }
-    for dFC_id in range(0, 7):
-        print(f"=================== dFC {dFC_id} ===================")
-
-        ML_RESULT = {}
-        for task_id, task in enumerate(TASKS):
-            if RUNS is None:
-                RUNS = {task: [None]}
-            ML_RESULT[task] = {}
-            for run in RUNS[task]:
-                ML_RESULT_new, ML_scores_new = task_presence_classification(
-                    task=task,
-                    dFC_id=dFC_id,
-                    roi_root=roi_root,
-                    dFC_root=dFC_root,
-                    run=run,
-                    dynamic_pred=dynamic_pred,
-                    normalize_dFC=normalize_dFC,
-                )
-                if run is None:
-                    ML_RESULT[task] = ML_RESULT_new
-                else:
-                    ML_RESULT[task][run] = ML_RESULT_new
-                for key in ML_scores:
-                    ML_scores[key].extend(ML_scores_new[key])
-
-        folder = f"{output_root}"
-        if not os.path.exists(folder):
-            os.makedirs(folder)
-        np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT)
-
-    np.save(f"{folder}/ML_scores_classify.npy", ML_scores)
+    for session in SESSIONS:
+        if not session is None:
+            print(f"=================== {session} ===================")
+        ML_scores = {
+            "subj_id": list(),
+            "group": list(),
+            "task": list(),
+            "run": list(),
+            "dFC method": list(),
+            "KNN accuracy": list(),
+        }
+        for dFC_id in range(0, 7):
+            print(f"=================== dFC {dFC_id} ===================")
+
+            ML_RESULT = {}
+            for task_id, task in enumerate(TASKS):
+                if RUNS is None:
+                    RUNS = {task: [None]}
+                ML_RESULT[task] = {}
+                for run in RUNS[task]:
+                    ML_RESULT_new, ML_scores_new = task_presence_classification(
+                        task=task,
+                        dFC_id=dFC_id,
+                        roi_root=roi_root,
+                        dFC_root=dFC_root,
+                        run=run,
+                        session=session,
+                        dynamic_pred=dynamic_pred,
+                        normalize_dFC=normalize_dFC,
+                    )
+                    if run is None:
+                        ML_RESULT[task] = ML_RESULT_new
+                    else:
+                        ML_RESULT[task][run] = ML_RESULT_new
+                    for key in ML_scores:
+                        ML_scores[key].extend(ML_scores_new[key])
+
+            if session is None:
+                folder = f"{output_root}"
+            else:
+                folder = f"{output_root}/{session}"
+            if not os.path.exists(folder):
+                os.makedirs(folder)
+            np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT)
+
+        np.save(f"{folder}/ML_scores_classify.npy", ML_scores)
 
 
 #######################################################################################
@@ -484,6 +568,13 @@ def run_classification(
     else:
         RUNS = None
 
+    if "SESSIONS" in dataset_info:
+        SESSIONS = dataset_info["SESSIONS"]
+    else:
+        SESSIONS = None
+    if SESSIONS is None:
+        SESSIONS = [None]
+
     if "{dataset}" in dataset_info["main_root"]:
         main_root = dataset_info["main_root"].replace(
             "{dataset}", dataset_info["dataset"]
@@ -509,12 +600,14 @@ def run_classification(
     extract_task_features(
         TASKS=TASKS,
         RUNS=RUNS,
+        SESSIONS=SESSIONS,
         roi_root=roi_root,
         output_root=ML_root,
     )
     run_classification(
         TASKS=TASKS,
         RUNS=RUNS,
+        SESSIONS=SESSIONS,
         roi_root=roi_root,
         dFC_root=dFC_root,
         output_root=ML_root,

From 56b4c052c40d36b35f394158779e0d8c2379a4d9 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 9 Jun 2024 21:48:24 -0400
Subject: [PATCH 036/401] minor fix

---
 task_dFC/ML.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 15c575d..e24b61a 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -66,8 +66,6 @@ def extract_task_features(TASKS, RUNS, SESSIONS, roi_root, output_root):
             if task == "task-restingstate":
                 continue
 
-            if RUNS is None:
-                RUNS = {task: [None]}
             for run in RUNS[task]:
 
                 SUBJECTS = find_available_subjects(
@@ -505,8 +503,6 @@ def run_classification(
 
             ML_RESULT = {}
             for task_id, task in enumerate(TASKS):
-                if RUNS is None:
-                    RUNS = {task: [None]}
                 ML_RESULT[task] = {}
                 for run in RUNS[task]:
                     ML_RESULT_new, ML_scores_new = task_presence_classification(
@@ -561,12 +557,11 @@ def run_classification(
 
     TASKS = dataset_info["TASKS"]
     if "RUNS" in dataset_info:
-        if dataset_info["RUNS"] is not None:
-            RUNS = dataset_info["RUNS"]
-        else:
-            RUNS = None
+        RUNS = dataset_info["RUNS"]
     else:
         RUNS = None
+    if RUNS is None:
+        RUNS = {task: [None] for task in TASKS}
 
     if "SESSIONS" in dataset_info:
         SESSIONS = dataset_info["SESSIONS"]

From 6ccd229074b03cbcd1abc0ef96320911b24f4e4b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 11 Jun 2024 21:52:36 -0400
Subject: [PATCH 037/401] add logistic reg

---
 task_dFC/ML.py | 41 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index e24b61a..bf50e27 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -4,6 +4,7 @@
 
 import numpy as np
 from sklearn.decomposition import PCA
+from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import balanced_accuracy_score
 from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KNeighborsClassifier
@@ -361,7 +362,8 @@ def task_presence_classification(
     explained_var_threshold=0.95,
 ):
     """
-    perform task presence classification using KNN for a given task and dFC method and run.
+    perform task presence classification using logistic regression and KNN
+    for a given task and dFC method and run.
     """
     if run is None:
         print(f"=============== {task} ===============")
@@ -404,6 +406,23 @@ def task_presence_classification(
 
     print("task presence classification ...")
 
+    # logistic regression
+    logistic_reg = make_pipeline(StandardScaler(), LogisticRegression())
+    # create a dictionary of all values we want to test for C
+    param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
+    # use gridsearch to test all values for C
+    lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=5)
+    # fit model to data
+    lr_gscv.fit(X_train, y_train)
+
+    C = lr_gscv.best_params_["C"]
+
+    log_reg = make_pipeline(
+        StandardScaler(),
+        LogisticRegression(C=C),
+    ).fit(X_train, y_train)
+
+    # KNN
     # find num_PCs
     pca = PCA(svd_solver="full", whiten=False)
     pca.fit(X_train)
@@ -434,6 +453,10 @@ def task_presence_classification(
     ).fit(X_train, y_train)
 
     ML_RESULT = {
+        "logistic regression": log_reg,
+        "logistic regression C": C,
+        "logistic regression train score": log_reg.score(X_train, y_train),
+        "logistic regression test score": log_reg.score(X_test, y_test),
         "pca": pca,
         "num_PCs": num_PCs,
         "cv_results": knn_gscv.cv_results_,
@@ -442,6 +465,12 @@ def task_presence_classification(
         "KNN test score": neigh.score(X_test, y_test),
     }
 
+    print(
+        f"Logistic regression train score {measure_name} {task}: {log_reg.score(X_train, y_train)}"
+    )
+    print(
+        f"Logistic regression test score {measure_name} {task}: {log_reg.score(X_test, y_test)}"
+    )
     print(f"KNN train score {measure_name} {task}: {neigh.score(X_train, y_train)}")
     print(f"KNN test score {measure_name} {task}: {neigh.score(X_test, y_test)}")
 
@@ -453,6 +482,7 @@ def task_presence_classification(
         "task": list(),
         "run": list(),
         "dFC method": list(),
+        "Logistic regression accuracy": list(),
         "KNN accuracy": list(),
     }
     for subj in SUBJECTS:
@@ -466,9 +496,13 @@ def task_presence_classification(
             features = X_test[subj_label_test == subj, :]
             target = y_test[subj_label_test == subj]
 
-        pred = neigh.predict(features)
+        pred_lr = log_reg.predict(features)
+        pred_KNN = neigh.predict(features)
 
-        ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred))
+        ML_scores["Logistic regression accuracy"].append(
+            balanced_accuracy_score(target, pred_lr)
+        )
+        ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN))
 
         ML_scores["task"].append(task)
         ML_scores["run"].append(run)
@@ -496,6 +530,7 @@ def run_classification(
             "task": list(),
             "run": list(),
             "dFC method": list(),
+            "Logistic regression accuracy": list(),
             "KNN accuracy": list(),
         }
         for dFC_id in range(0, 7):

From 7fa88fb2d3d0ca69a2f54cec94060394790476d3 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 11 Jun 2024 22:34:02 -0400
Subject: [PATCH 038/401] fix file load manage

---
 task_dFC/ML.py                  | 10 ++--
 task_dFC/dFC_assessment.py      | 14 ++++--
 task_dFC/nifti_to_roi_signal.py | 87 ++++++++++++++++++---------------
 3 files changed, 65 insertions(+), 46 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index bf50e27..8171048 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -30,16 +30,20 @@ def find_available_subjects(dFC_root, task, run=None, session=None, dFC_id=None)
             ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/")
         else:
             ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/{session}/")
-        ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if task in dFC_file]
+        ALL_DFC_FILES = [
+            dFC_file for dFC_file in ALL_DFC_FILES if f"_{task}_" in dFC_file
+        ]
         if dFC_id is not None:
             ALL_DFC_FILES = [
                 dFC_file for dFC_file in ALL_DFC_FILES if f"_{dFC_id}.npy" in dFC_file
             ]
         if run is not None:
-            ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if run in dFC_file]
+            ALL_DFC_FILES = [
+                dFC_file for dFC_file in ALL_DFC_FILES if f"_{run}_" in dFC_file
+            ]
         if session is not None:
             ALL_DFC_FILES = [
-                dFC_file for dFC_file in ALL_DFC_FILES if session in dFC_file
+                dFC_file for dFC_file in ALL_DFC_FILES if f"_{session}_" in dFC_file
             ]
         ALL_DFC_FILES.sort()
         if len(ALL_DFC_FILES) > 0:
diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py
index 3b858a5..06253ac 100644
--- a/task_dFC/dFC_assessment.py
+++ b/task_dFC/dFC_assessment.py
@@ -71,12 +71,16 @@ def run_dFC_assess(
     ALL_ROI_FILES = [
         roi_file
         for roi_file in ALL_ROI_FILES
-        if ("_time-series.npy" in roi_file) and (task in roi_file)
+        if ("_time-series.npy" in roi_file) and (f"_{task}_" in roi_file)
     ]
     if session is not None:
-        ALL_ROI_FILES = [roi_file for roi_file in ALL_ROI_FILES if (session in roi_file)]
+        ALL_ROI_FILES = [
+            roi_file for roi_file in ALL_ROI_FILES if (f"_{session}_" in roi_file)
+        ]
     if run is not None:
-        ALL_ROI_FILES = [roi_file for roi_file in ALL_ROI_FILES if (run in roi_file)]
+        ALL_ROI_FILES = [
+            roi_file for roi_file in ALL_ROI_FILES if (f"_{run}_" in roi_file)
+        ]
     ALL_ROI_FILES.sort()
 
     # if there are no files for this task, return
@@ -91,7 +95,9 @@ def run_dFC_assess(
     ).item()
 
     ALL_RECORDS = os.listdir(f"{fitted_measures_dir}/")
-    ALL_RECORDS = [i for i in ALL_RECORDS if ("MEASURE" in i) and (file_suffix in i)]
+    ALL_RECORDS = [
+        i for i in ALL_RECORDS if ("MEASURE" in i) and (f"_{file_suffix}_" in i)
+    ]
     ALL_RECORDS.sort()
     MEASURES_fit_lst = list()
     for s in ALL_RECORDS:
diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index fc54d9c..0d65049 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -2,6 +2,7 @@
 import json
 import os
 import warnings
+from re import A
 
 import numpy as np
 
@@ -18,57 +19,44 @@ def run_roi_signal_extraction(
     fmriprep_root,
     bold_suffix,
     output_root,
-    session="",
+    session=None,
+    RUNS=[None],
 ):
     """
     Extract ROI signals and task labels for a given subject and task
     and optionally session.
     """
+    if session is None:
+        session_str = ""
+    else:
+        session_str = session
     # find the func file for this subject and task
     try:
-        if session == "":
+        if session is None:
             ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/")
         else:
             ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/{session}/func/")
     except FileNotFoundError:
-        print(f"Subject {subj} {session} not found in {fmriprep_root}")
+        print(f"Subject {subj} {session_str} not found in {fmriprep_root}")
         return
 
     ALL_TASK_FILES = [
         file_i
         for file_i in ALL_TASK_FILES
-        if (bold_suffix in file_i) and (task in file_i)
+        if (bold_suffix in file_i) and (f"_{task}_" in file_i)
     ]  # only keep the denoised files? or use the original files?
 
     if not len(ALL_TASK_FILES) >= 1:
         # if the func file is not found, exclude the subject
-        print(f"Func file not found for {subj} {session} {task}")
+        print(f"Func file not found for {subj} {session_str} {task}")
         return
 
-    # there might be multiple runs for the same task
-    # check if "_run" exists in all the task file names
-    if all(["_run" in task_file for task_file in ALL_TASK_FILES]):
-        multi_run_flag = True
-        # find all the runs
-        RUNS = [
-            task_file[
-                task_file.find("_run")
-                + 1 : task_file.find("_run")
-                + 1
-                + task_file[task_file.find("_run") + 1 :].find("_")
-            ]
-            for task_file in ALL_TASK_FILES
-        ]
-        # sort
-        RUNS.sort()
-        print(f"Found multiple runs for {subj} {task}: {RUNS}")
-    else:
-        multi_run_flag = False
-        RUNS = [""]
-
     for run in RUNS:
-        task_file = [file_i for file_i in ALL_TASK_FILES if run in file_i][0]
-        if session == "":
+        if run is None:
+            task_file = ALL_TASK_FILES[0]
+        else:
+            task_file = [file_i for file_i in ALL_TASK_FILES if f"_{run}_" in file_i][0]
+        if session is None:
             nifti_file = f"{fmriprep_root}/{subj}/func/{task_file}"
             task_events_root = f"{main_root}/bids/{subj}/func"
         else:
@@ -107,15 +95,24 @@ def run_roi_signal_extraction(
             ALL_EVENTS_FILES = [
                 file_i
                 for file_i in ALL_EVENTS_FILES
-                if (subj in file_i)
-                and (task in file_i)
-                and (run in file_i)
-                and (session in file_i)
+                if (f"{subj}_" in file_i)
+                and (f"_{task}_" in file_i)
                 and ("events.tsv" in file_i)
             ]
+            if not run is None:
+                ALL_EVENTS_FILES = [
+                    file_i for file_i in ALL_EVENTS_FILES if f"_{run}_" in file_i
+                ]
+            if not session is None:
+                ALL_EVENTS_FILES = [
+                    file_i for file_i in ALL_EVENTS_FILES if f"_{session}_" in file_i
+                ]
             if not len(ALL_EVENTS_FILES) == 1:
                 # if the events file is not found, exclude the subject
-                print(f"Events file not found for {subj} {session} {task} {run}")
+                if run is None:
+                    print(f"Events file not found for {subj} {session_str} {task}")
+                else:
+                    print(f"Events file not found for {subj} {session_str} {task} {run}")
                 return
             # load the tsv events file
             events_file = f"{task_events_root}/{ALL_EVENTS_FILES[0]}"
@@ -147,17 +144,17 @@ def run_roi_signal_extraction(
             "num_time_mri": num_time_mri,
         }
 
-        if session == "":
+        if session is None:
             subj_session_prefix = f"{subj}"
             output_dir = f"{output_root}/{subj}"
         else:
             subj_session_prefix = f"{subj}_{session}"
             output_dir = f"{output_root}/{subj}/{session}"
 
-        if multi_run_flag:
-            output_file_prefix = f"{subj_session_prefix}_{task}_{run}"
-        else:
+        if run is None:
             output_file_prefix = f"{subj_session_prefix}_{task}"
+        else:
+            output_file_prefix = f"{subj_session_prefix}_{task}_{run}"
 
         if not os.path.exists(f"{output_dir}/"):
             os.makedirs(f"{output_dir}/")
@@ -192,9 +189,20 @@ def run_roi_signal_extraction(
     )
 
     TASKS = dataset_info["TASKS"]
-    SESSIONS = dataset_info["SESSIONS"]
+
+    if "SESSIONS" in dataset_info:
+        SESSIONS = dataset_info["SESSIONS"]
+    else:
+        SESSIONS = None
     if SESSIONS is None:
-        SESSIONS = [""]
+        SESSIONS = [None]
+
+    if "RUNS" in dataset_info:
+        RUNS = dataset_info["RUNS"]
+    else:
+        RUNS = None
+    if RUNS is None:
+        RUNS = {task: [None] for task in TASKS}
 
     if "{dataset}" in dataset_info["main_root"]:
         main_root = dataset_info["main_root"].replace(
@@ -223,6 +231,7 @@ def run_roi_signal_extraction(
                 bold_suffix=dataset_info["bold_suffix"],
                 output_root=output_root,
                 session=session,
+                RUNS=RUNS[task],
             )
 
     print(

From 37ef96265e07ba95a861ac6b8f9b39febbac0194 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 12 Jun 2024 15:47:51 -0400
Subject: [PATCH 039/401] add clustering to ML

---
 task_dFC/ML.py | 172 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 170 insertions(+), 2 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 8171048..88aa2a0 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -3,16 +3,17 @@
 import os
 
 import numpy as np
+from sklearn.cluster import KMeans
 from sklearn.decomposition import PCA
 from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import balanced_accuracy_score
+from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score
 from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 
 from pydfc import DFC, data_loader, task_utils
-from pydfc.dfc_utils import dFC_mat2vec, rank_norm
+from pydfc.dfc_utils import dFC_mat2vec, dFC_vec2mat, rank_norm
 
 #######################################################################################
 
@@ -515,6 +516,103 @@ def task_presence_classification(
     return ML_RESULT, ML_scores
 
 
+def task_presence_clustering(
+    task,
+    dFC_id,
+    roi_root,
+    dFC_root,
+    run=None,
+    session=None,
+    normalize_dFC=True,
+    explained_var_threshold=0.95,
+):
+    """
+    cluster dFC samples into task/rest with PCA + kmeans and score with ARI,
+    for a given task and dFC method and run.
+    returns (clustering_RESULTS, clustering_scores); None for resting state.
+    """
+    if run is None:
+        print(f"=============== {task} ===============")
+    else:
+        print(f"=============== {task} {run} ===============")
+
+    if task == "task-restingstate":
+        return
+
+    SUBJECTS = find_available_subjects(
+        dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id
+    )
+
+    print(f"Number of subjects: {len(SUBJECTS)}")
+
+    # NOTE(review): confirm dFC_feature_extraction copes with an empty test set
+    X, _, y, _, subj_label, _, measure_name = dFC_feature_extraction(
+        task=task,
+        train_subjects=SUBJECTS,
+        test_subjects=[],
+        dFC_id=dFC_id,
+        roi_root=roi_root,
+        dFC_root=dFC_root,
+        run=run,
+        session=session,
+        dynamic_pred="no",
+        normalize_dFC=normalize_dFC,
+    )
+
+    n_clusters = 2  # corresponding to task and rest
+    scaler = StandardScaler()
+    X_normalized = scaler.fit_transform(X)
+    # number of PCs = first index whose cumulative explained variance exceeds
+    # the threshold, plus one (index -> count); capped at the PCs available
+    pca = PCA()
+    pca.fit(X_normalized)
+    cum_var = np.cumsum(pca.explained_variance_ratio_)
+    first_above = int(np.searchsorted(cum_var, explained_var_threshold, side="right"))
+    n_components = min(first_above + 1, len(cum_var))
+    pca = PCA(n_components=n_components)
+    X_pca = pca.fit_transform(X_normalized)
+    kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=4)
+    labels_pred = kmeans.fit_predict(X_pca)
+
+    ARI = adjusted_rand_score(y, labels_pred)
+    print(f"ARI score: {ARI}")
+
+    # project the centroids back from PC space to the original feature scale
+    centroids = kmeans.cluster_centers_
+    centroids = pca.inverse_transform(centroids)
+    centroids = scaler.inverse_transform(centroids)
+    # n_regions solves n(n-1)/2 = vector length; cast since it is a count
+    n_regions = int(round((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2))
+    centroids_mat = dFC_vec2mat(centroids, n_regions)
+
+    clustering_RESULTS = {
+        "num_PCs": n_components,
+        "PCA": pca,
+        "kmeans": kmeans,
+        "ARI": ARI,
+        "centroids": centroids_mat,
+    }
+
+    clustering_scores = {
+        "subj_id": list(),
+        "task": list(),
+        "run": list(),
+        "dFC method": list(),
+        "Kmeans ARI": list(),
+    }
+    for subj in SUBJECTS:
+        subj_mask = subj_label == subj
+        # score the labels already assigned in PC space for this subject
+        subj_ARI = adjusted_rand_score(y[subj_mask], labels_pred[subj_mask])
+        clustering_scores["subj_id"].append(subj)
+        clustering_scores["Kmeans ARI"].append(subj_ARI)
+        clustering_scores["task"].append(task)
+        clustering_scores["run"].append(run)
+        clustering_scores["dFC method"].append(measure_name)
+
+    return clustering_RESULTS, clustering_scores
+
+
 def run_classification(
     TASKS,
     RUNS,
@@ -572,6 +670,61 @@ def run_classification(
         np.save(f"{folder}/ML_scores_classify.npy", ML_scores)
 
 
+def run_clustering(
+    TASKS,
+    RUNS,
+    SESSIONS,
+    roi_root,
+    dFC_root,
+    output_root,
+    normalize_dFC=True,
+):
+    """run task presence clustering per session, dFC method, task and run."""
+    for session in SESSIONS:
+        if session is not None:
+            print(f"=================== {session} ===================")
+        # the output folder depends only on the session, so resolve it once
+        folder = f"{output_root}" if session is None else f"{output_root}/{session}"
+        os.makedirs(folder, exist_ok=True)
+        clustering_scores = {
+            "subj_id": list(),
+            "task": list(),
+            "run": list(),
+            "dFC method": list(),
+            "Kmeans ARI": list(),
+        }
+        for dFC_id in range(0, 7):
+            print(f"=================== dFC {dFC_id} ===================")
+
+            clustering_RESULTS = {}
+            for task in TASKS:
+                clustering_RESULTS[task] = {}
+                for run in RUNS[task]:
+                    output = task_presence_clustering(
+                        task=task,
+                        dFC_id=dFC_id,
+                        roi_root=roi_root,
+                        dFC_root=dFC_root,
+                        run=run,
+                        session=session,
+                        normalize_dFC=normalize_dFC,
+                    )
+                    if output is None:
+                        # nothing is returned for skipped tasks (resting state)
+                        continue
+                    clustering_RESULTS_new, clustering_scores_new = output
+                    if run is None:
+                        clustering_RESULTS[task] = clustering_RESULTS_new
+                    else:
+                        clustering_RESULTS[task][run] = clustering_RESULTS_new
+                    for key in clustering_scores:
+                        clustering_scores[key].extend(clustering_scores_new[key])
+
+            np.save(f"{folder}/clustering_RESULTS_{dFC_id}.npy", clustering_RESULTS)
+
+        np.save(f"{folder}/clustering_scores.npy", clustering_scores)
+
+
 #######################################################################################
 
 if __name__ == "__main__":
@@ -638,6 +791,9 @@ def run_classification(
         roi_root=roi_root,
         output_root=ML_root,
     )
+
+    print("Task features extraction finished.")
+    print("Task presence classification started ...")
     run_classification(
         TASKS=TASKS,
         RUNS=RUNS,
@@ -648,6 +804,18 @@ def run_classification(
         dynamic_pred="no",
         normalize_dFC=True,
     )
+    print("Task presence classification finished.")
+    print("Task presence clustering started ...")
+    run_clustering(
+        TASKS=TASKS,
+        RUNS=RUNS,
+        SESSIONS=SESSIONS,
+        roi_root=roi_root,
+        dFC_root=dFC_root,
+        output_root=ML_root,
+        normalize_dFC=True,
+    )
+    print("Task presence clustering finished.")
 
     print("Task presence prediction CODE finished running.")
 

From 0a0842363dd277a2aea124e83ab9975935dd6c3d Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 13 Jun 2024 13:53:01 -0400
Subject: [PATCH 040/401] add bash run files

---
 task_dFC/run_scripts/dataset_info.json   | 19 +++++++++
 task_dFC/run_scripts/global_configs.json | 54 ++++++++++++++++++++++++
 task_dFC/run_scripts/methods_config.json | 35 +++++++++++++++
 task_dFC/run_scripts/run_FCS.sh          | 19 +++++++++
 task_dFC/run_scripts/run_ML.sh           | 16 +++++++
 task_dFC/run_scripts/run_dFC.sh          | 24 +++++++++++
 task_dFC/run_scripts/run_fmriprep.sh     | 28 ++++++++++++
 task_dFC/run_scripts/run_nifti_to_roi.sh | 24 +++++++++++
 8 files changed, 219 insertions(+)
 create mode 100644 task_dFC/run_scripts/dataset_info.json
 create mode 100644 task_dFC/run_scripts/global_configs.json
 create mode 100644 task_dFC/run_scripts/methods_config.json
 create mode 100644 task_dFC/run_scripts/run_FCS.sh
 create mode 100644 task_dFC/run_scripts/run_ML.sh
 create mode 100644 task_dFC/run_scripts/run_dFC.sh
 create mode 100644 task_dFC/run_scripts/run_fmriprep.sh
 create mode 100644 task_dFC/run_scripts/run_nifti_to_roi.sh

diff --git a/task_dFC/run_scripts/dataset_info.json b/task_dFC/run_scripts/dataset_info.json
new file mode 100644
index 0000000..adfa42a
--- /dev/null
+++ b/task_dFC/run_scripts/dataset_info.json
@@ -0,0 +1,19 @@
+{
+	"dataset" : "",
+	"main_root" : "/data/origami/dFC/DATA/task-based/openneuro/{dataset}",
+	"fmriprep_root" : "{main_root}/derivatives/fmriprep/23.1.3/output",
+	"roi_root" : "{main_root}/derivatives/ROI_timeseries",
+	"fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES",
+	"dFC_root" : "{main_root}/derivatives/dFC_assessed",
+	"ML_root" : "{main_root}/derivatives/ML",
+	"bold_suffix" : "_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz",
+	"SESSIONS" : [
+		"ses-1"
+	],
+	"TASKS" : [
+		"task-A"
+	],
+	"RUNS" : {
+    		"task-A": ["run-01", "run-02", "run-03", "run-04", "run-05", "run-06"]
+	}
+}
diff --git a/task_dFC/run_scripts/global_configs.json b/task_dFC/run_scripts/global_configs.json
new file mode 100644
index 0000000..ada5894
--- /dev/null
+++ b/task_dFC/run_scripts/global_configs.json
@@ -0,0 +1,54 @@
+{
+    "DATASET_NAME": "",
+    "DATASET_ROOT": "/data/origami/dFC/DATA/task-based/openneuro//",
+
+    "CONTAINER_STORE": "/data/origami/container_store/nipoppy/",
+
+    "SINGULARITY_PATH": "singularity",
+
+    "TEMPLATEFLOW_DIR": "/data/origami/templateflow",
+
+    "SESSIONS": [],
+    "VISITS": [],
+
+    "BIDS": {
+        "heudiconv": {
+            "VERSION": "0.11.6",
+            "CONTAINER": "heudiconv_{}.sif",
+            "URL": ""
+        },
+        "validator":{
+            "CONTAINER": "bids_validator.sif",
+            "URL": ""
+
+        }
+    },
+
+    "PROC_PIPELINES": {
+        "mriqc": {
+            "VERSION": "23.1.0",
+            "CONTAINER": "mriqc_{}.sif",
+            "URL": ""
+        },
+        "fmriprep": {
+            "VERSION": "23.1.3",
+            "CONTAINER": "fmriprep_{}.sif",
+            "URL": ""
+        },
+        "freesurfer": {
+            "VERSION": "7.3.2",
+            "CONTAINER": "fmriprep_{}.sif",
+            "URL": ""
+        }
+    },
+
+    "TABULAR": {
+        "data_dictionary": {
+            "PATH": "",
+            "VERSION": "",
+            "URL": ""
+        }
+    },
+
+    "WORKFLOWS": []
+}
diff --git a/task_dFC/run_scripts/methods_config.json b/task_dFC/run_scripts/methods_config.json
new file mode 100644
index 0000000..d4013d4
--- /dev/null
+++ b/task_dFC/run_scripts/methods_config.json
@@ -0,0 +1,35 @@
+{
+    "params_methods" : {
+        "W": 12,
+        "n_overlap": 1.0,
+        "sw_method": "pear_corr",
+        "tapered_window": true,
+        "TF_method": "WTC",
+        "clstr_base_measure": "SlidingWindow",
+        "hmm_iter": 20,
+        "dhmm_obs_state_ratio": 0.666,
+        "n_states": 5,
+        "n_subj_clstrs": 10,
+        "n_jobs": 2,
+        "verbose": 0,
+        "backend": "loky",
+        "normalization": true,
+        "num_subj": null,
+        "num_time_point": null
+    },
+    "MEASURES_name_lst" : [
+        "SlidingWindow",
+        "Time-Freq",
+        "CAP",
+        "ContinuousHMM",
+        "Windowless",
+        "Clustering",
+        "DiscreteHMM"
+    ],
+    "alter_hparams" : [],
+    "params_multi_analysis" : {
+        "n_jobs": null,
+        "verbose": 0,
+        "backend": "loky"
+    }
+}
diff --git a/task_dFC/run_scripts/run_FCS.sh b/task_dFC/run_scripts/run_FCS.sh
new file mode 100644
index 0000000..fb22ed5
--- /dev/null
+++ b/task_dFC/run_scripts/run_FCS.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+#$ -cwd
+#$ -o logs/fcs_out.txt
+#$ -e logs/fcs_err.txt
+#$ -l h_vmem=64G
+#$ -q origami.q
+#$ -t 1-10
+
+DATASET_INFO="./dataset_info.json"
+METHODS_CONFIG="./methods_config.json"
+
+source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
+conda activate pydfc
+python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/FCS_estimate.py" \
+--dataset_info $DATASET_INFO \
+--methods_config $METHODS_CONFIG
+
+conda deactivate
diff --git a/task_dFC/run_scripts/run_ML.sh b/task_dFC/run_scripts/run_ML.sh
new file mode 100644
index 0000000..feaf0b0
--- /dev/null
+++ b/task_dFC/run_scripts/run_ML.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# SGE job: runs ML.py in the pydfc conda env; bash is needed for `source`.
+#$ -cwd
+#$ -o logs/ML_out.txt
+#$ -e logs/ML_err.txt
+#$ -l h_vmem=32G
+#$ -q origami.q
+
+DATASET_INFO="./dataset_info.json"
+
+source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
+conda activate pydfc
+python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/ML.py" \
+--dataset_info "$DATASET_INFO"
+
+conda deactivate
diff --git a/task_dFC/run_scripts/run_dFC.sh b/task_dFC/run_scripts/run_dFC.sh
new file mode 100644
index 0000000..0683935
--- /dev/null
+++ b/task_dFC/run_scripts/run_dFC.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# SGE array job: task N runs dFC_assessment.py for line N of subj_list.txt; aborts if that line is empty.
+#$ -cwd
+#$ -o logs/dfc_out.txt
+#$ -e logs/dfc_err.txt
+#$ -l h_vmem=32G
+#$ -q origami.q
+#$ -t 1-200
+
+SUBJECT_LIST="./subj_list.txt"
+DATASET_INFO="./dataset_info.json"
+
+echo "Number subjects found: $(wc -l < "$SUBJECT_LIST")"
+
+SUBJECT_ID=$(sed -n "${SGE_TASK_ID}p" "$SUBJECT_LIST")
+echo "Subject ID: ${SUBJECT_ID:?no entry on line ${SGE_TASK_ID} of ${SUBJECT_LIST}}"
+
+source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
+conda activate pydfc
+python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/dFC_assessment.py" \
+--dataset_info "$DATASET_INFO" \
+--participant_id "$SUBJECT_ID"
+
+conda deactivate
diff --git a/task_dFC/run_scripts/run_fmriprep.sh b/task_dFC/run_scripts/run_fmriprep.sh
new file mode 100644
index 0000000..53dc89d
--- /dev/null
+++ b/task_dFC/run_scripts/run_fmriprep.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+#$ -cwd
+#$ -o logs/fmriprep_out.log
+#$ -e logs/fmriprep_err.log
+#$ -l h_rt=24:00:00
+#$ -l h_vmem=32G
+#$ -q origami.q
+
+#$ -t 1-122
+
+# TODO replace with local paths
+source "/data/origami/dFC/anaconda3/etc/profile.d/conda.sh"
+conda activate nipoppy_env
+
+SUBJECT_LIST="./subj_list.txt"
+GLOBAL_CONFIG="../proc/global_configs.json"
+
+echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`"
+
+SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST`
+echo "Subject ID: $SUBJECT_ID"
+
+python "/data/origami/dFC/CODEs/nipoppy/nipoppy/workflow/proc_pipe/fmriprep/run_fmriprep.py" \
+--global_config $GLOBAL_CONFIG \
+--participant_id $SUBJECT_ID
+
+conda deactivate
diff --git a/task_dFC/run_scripts/run_nifti_to_roi.sh b/task_dFC/run_scripts/run_nifti_to_roi.sh
new file mode 100644
index 0000000..5f10f08
--- /dev/null
+++ b/task_dFC/run_scripts/run_nifti_to_roi.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# SGE array job: task N runs nifti_to_roi_signal.py for line N of subj_list.txt; aborts if that line is empty.
+#$ -cwd
+#$ -o logs/roi_out.txt
+#$ -e logs/roi_err.txt
+#$ -l h_vmem=32G
+#$ -q origami.q
+#$ -t 1-200
+
+SUBJECT_LIST="./subj_list.txt"
+DATASET_INFO="./dataset_info.json"
+
+echo "Number subjects found: $(wc -l < "$SUBJECT_LIST")"
+
+SUBJECT_ID=$(sed -n "${SGE_TASK_ID}p" "$SUBJECT_LIST")
+echo "Subject ID: ${SUBJECT_ID:?no entry on line ${SGE_TASK_ID} of ${SUBJECT_LIST}}"
+
+source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
+conda activate pydfc
+python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/nifti_to_roi_signal.py" \
+--dataset_info "$DATASET_INFO" \
+--participant_id "$SUBJECT_ID"
+
+conda deactivate

From 33d67fca24669abbecd2eb28fcb7f8d3fb483215 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 14 Jun 2024 14:23:24 -0400
Subject: [PATCH 041/401] minor fix

---
 task_dFC/ML.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 88aa2a0..9ac3f62 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -564,12 +564,13 @@ def task_presence_clustering(
     # find number of components that explain 95% of variance
     pca = PCA()
     pca.fit(X_normalized)
-    n_components = np.where(
-        np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold
-    )[0][0]
+    n_components = (
+        np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
+        + 1
+    )
     pca = PCA(n_components=n_components)
     X_pca = pca.fit_transform(X_normalized)
-    kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=4)
+    kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
     labels_pred = kmeans.fit_predict(X_pca)
 
     # ARI score

From b5591919366a67b9fd4b809d5aebea4ea23c0bf6 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 14 Jun 2024 14:34:37 -0400
Subject: [PATCH 042/401] minor change

---
 task_dFC/ML.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 9ac3f62..a8c9989 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -584,6 +584,7 @@ def task_presence_clustering(
     centroids_mat = dFC_vec2mat(centroids, n_regions)
 
     clustering_RESULTS = {
+        "StandardScaler": scaler,
         "num_PCs": n_components,
         "PCA": pca,
         "kmeans": kmeans,

From 7e87cbf4b5f555ba33cfcd939128c3be6caea0a5 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 18 Jun 2024 22:38:14 -0400
Subject: [PATCH 043/401] minor fix

---
 task_dFC/ML.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index a8c9989..e5a52d6 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -414,13 +414,13 @@ def task_presence_classification(
     # logistic regression
     logistic_reg = make_pipeline(StandardScaler(), LogisticRegression())
     # create a dictionary of all values we want to test for C
-    param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
+    param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
     # use gridsearch to test all values for C
     lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=5)
     # fit model to data
     lr_gscv.fit(X_train, y_train)
 
-    C = lr_gscv.best_params_["C"]
+    C = lr_gscv.best_params_["logisticregression__C"]
 
     log_reg = make_pipeline(
         StandardScaler(),

From ae98a4d1152d06c5606d85536b030bd5f2afaf88 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Jun 2024 16:48:49 -0400
Subject: [PATCH 044/401] add generate_report

---
 task_dFC/generate_report.py            | 477 +++++++++++++++++++++++++
 task_dFC/run_scripts/dataset_info.json |   1 +
 task_dFC/run_scripts/run_report.sh     |  16 +
 3 files changed, 494 insertions(+)
 create mode 100644 task_dFC/generate_report.py
 create mode 100644 task_dFC/run_scripts/run_report.sh

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
new file mode 100644
index 0000000..05169e4
--- /dev/null
+++ b/task_dFC/generate_report.py
@@ -0,0 +1,477 @@
+import argparse
+import json
+import os
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from nilearn import image, plotting
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+
+from pydfc import DFC, data_loader, task_utils
+from pydfc.dfc_utils import TR_intersection, dFC_mat2vec, dFC_vec2mat, rank_norm
+
+#######################################################################################
+
+
+def load_dFC(dFC_root, subj, task, dFC_id, run=None, session=None):
+    """
+    Load the dFC results for a given subject, task, dFC_id, run and session.
+    """
+    if session is None:
+        if run is None:
+            dFC = np.load(
+                f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
+            ).item()
+        else:
+            dFC = np.load(
+                f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE"
+            ).item()
+    else:
+        if run is None:
+            dFC = np.load(
+                f"{dFC_root}/{subj}/{session}/dFC_{session}_{task}_{dFC_id}.npy",
+                allow_pickle="TRUE",
+            ).item()
+        else:
+            dFC = np.load(
+                f"{dFC_root}/{subj}/{session}/dFC_{session}_{task}_{run}_{dFC_id}.npy",
+                allow_pickle="TRUE",
+            ).item()
+
+    return dFC
+
+
+def load_task_data(roi_root, subj, task, run=None, session=None):
+    """
+    Load the task data for a given subject, task and run.
+    """
+    if session is None:
+        if run is None:
+            task_data = np.load(
+                f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
+            ).item()
+        else:
+            task_data = np.load(
+                f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy",
+                allow_pickle="TRUE",
+            ).item()
+    else:
+        if run is None:
+            task_data = np.load(
+                f"{roi_root}/{subj}/{session}/{subj}_{session}_{task}_task-data.npy",
+                allow_pickle="TRUE",
+            ).item()
+        else:
+            task_data = np.load(
+                f"{roi_root}/{subj}/{session}/{subj}_{session}_{task}_{run}_task-data.npy",
+                allow_pickle="TRUE",
+            ).item()
+
+    return task_data
+
+
+# def plot_anatomical(
+#     fmriprep_root,
+#     subj,
+#     anat_suffix,
+#     session=None,
+# ):
+#     anat_suffix = '_space-MNI152NLin2009cAsym_desc-preproc_T1w.nii.gz'
+#     anat_file = f"{fmriprep_root}/{subj}/anat/{subj}{anat_suffix}"
+#     display = plotting.plot_anat(anat_file, title="plot_anat")
+
+
+# def plot_functional(
+#     fmriprep_root,
+#     subj,
+#     bold_suffix,
+#     task,
+#     session=None,
+#     run=None,
+# ):
+#     if session is None:
+#         if run is None:
+#             task_file = f"{subj}_{task}{bold_suffix}"
+#         else:
+#             task_file = f"{subj}_{task}_{run}{bold_suffix}"
+#         func_file = f"{fmriprep_root}/{subj}/func/{task_file}"
+#     else:
+#         if run is None:
+#             task_file = f"{subj}_{session}_{task}{bold_suffix}"
+#         else:
+#             task_file = f"{subj}_{session}_{task}_{run}{bold_suffix}"
+#         func_file = f"{fmriprep_root}/{subj}/{session}/func/{task_file}"
+
+#     # Compute voxel-wise mean functional image across time dimension. Now we have
+#     # functional image in 3D assigned in mean_func_img
+#     mean_func_img = image.mean_img(func_file)
+#     display = plotting.plot_anat(mean_func_img, title="plot_func")
+
+
+def plot_roi_signals(
+    roi_root,
+    subj,
+    task,
+    session=None,
+    run=None,
+):
+    if session is None:
+        if run is None:
+            file_name = "{subj_id}_{task}_time-series.npy"
+        else:
+            file_name = "{subj_id}_{task}_{run}_time-series.npy"
+    else:
+        if run is None:
+            file_name = "{subj_id}_{session}_{task}_time-series.npy"
+        else:
+            file_name = "{subj_id}_{session}_{task}_{run}_time-series.npy"
+
+    BOLD = data_loader.load_TS(
+        data_root=roi_root,
+        file_name=file_name,
+        subj_id2load=subj,
+        task=task,
+        run=run,
+        session=session,
+    )
+
+    BOLD.visualize(nodes_lst=list(range(0, 10)), save_image=False, output_root=None)
+
+
+def plot_event_labels(
+    roi_root,
+    subj,
+    task,
+    run=None,
+    session=None,
+):
+    task_data = load_task_data(roi_root, subj, task, run, session)
+    Fs_task = task_data["Fs_task"]
+
+    time = np.arange(0, task_data["event_labels"].shape[0]) / Fs_task
+    plt.figure(figsize=(35, 4))
+    plt.plot(time, task_data["event_labels"], linewidth=4)
+    plt.title("Event labels")
+    plt.xlabel("Time (s)")
+    plt.show()
+
+
+def plot_task_presence(
+    roi_root,
+    subj,
+    task,
+    run=None,
+    session=None,
+):
+    task_data = load_task_data(roi_root, subj, task, run, session)
+    Fs_task = task_data["Fs_task"]
+    TR_task = 1 / Fs_task
+    TR_mri = task_data["TR_mri"]
+    Fs_mri = 1 / TR_mri
+
+    task_presence_non_binarized = task_utils.extract_task_presence(
+        event_labels=task_data["event_labels"],
+        TR_task=TR_task,
+        TR_mri=task_data["TR_mri"],
+        binary=False,
+    )
+
+    task_presence = task_utils.extract_task_presence(
+        event_labels=task_data["event_labels"],
+        TR_task=TR_task,
+        TR_mri=task_data["TR_mri"],
+        binary=True,
+        binarizing_method="mean",
+    )
+
+    time = np.arange(0, task_presence.shape[0]) / Fs_mri
+    plt.figure(figsize=(35, 4))
+    plt.plot(time, task_presence_non_binarized, linewidth=4)
+    plt.plot(time, task_presence, linewidth=4)
+    # plot mean of task presence_non_binarized as a line
+    plt.plot(time, np.mean(task_presence_non_binarized) * np.ones_like(time), linewidth=4)
+    plt.title("Task presence")
+    plt.xlabel("Time (s)")
+    plt.show()
+
+
+def calculate_subj_lvl_task_presence_characteristics(
+    roi_root,
+    subj,
+    task,
+    run=None,
+    session=None,
+):
+    task_data = load_task_data(roi_root, subj, task, run, session)
+    Fs_task = task_data["Fs_task"]
+    TR_task = 1 / Fs_task
+
+    task_presence = task_utils.extract_task_presence(
+        event_labels=task_data["event_labels"],
+        TR_task=TR_task,
+        TR_mri=task_data["TR_mri"],
+        binary=True,
+        binarizing_method="mean",
+    )
+    relative_task_on = task_utils.relative_task_on(task_presence)
+    # task duration
+    avg_task_duration, var_task_duration = task_utils.task_duration(
+        task_presence, task_data["TR_mri"]
+    )
+    # rest duration
+    avg_rest_duration, var_rest_duration = task_utils.rest_duration(
+        task_presence, task_data["TR_mri"]
+    )
+    # freq of transitions
+    num_of_transitions, relative_transition_freq = task_utils.transition_freq(
+        task_presence
+    )
+
+    print(f"Relative task on: {relative_task_on}")
+    print(f"Average task duration: {avg_task_duration} seconds")
+    print(f"Average rest duration: {avg_rest_duration} seconds")
+    print(f"Number of transitions: {num_of_transitions}")
+    print(f"Relative transition frequency: {relative_transition_freq}")
+
+
+def plot_FCS():
+    pass
+
+
+def plot_dFC_matrices(
+    dFC_root,
+    subj,
+    task,
+    start_time,
+    end_time,
+    run=None,
+    session=None,
+):
+    """
+    plot dFC matrices for a given subject, task, run, session, start_time and end_time
+    NOTE(review): `roi_root` is not a parameter; it is read from module scope (set in __main__).
+    parameters:
+        dFC_root: str, path to dFC results
+        subj: str, subject id
+        task: str, task name
+        start_time, end_time: float, bounds of the plotted window in seconds
+        run, session: optional, forwarded to load_task_data and load_dFC
+    """
+    task_data = load_task_data(roi_root, subj, task, run, session)
+    TR_mri = task_data["TR_mri"]
+
+    dFC_lst = list()
+    for dFC_id in range(0, 20):  # change this to the number of dFCs you have
+        try:
+            dFC = load_dFC(dFC_root, subj, task, dFC_id, run, session)
+            dFC_lst.append(dFC)
+        except Exception:
+            pass  # any load failure (e.g. no file for this dFC_id) is silently skipped
+
+    TRs = TR_intersection(dFC_lst)
+    start_TR = int(start_time / TR_mri)  # seconds -> TR index (truncated)
+    end_TR = int(end_time / TR_mri)
+    start_TR_idx = np.where(np.array(TRs) >= start_TR)[0][0]
+    end_TR_idx = np.where(np.array(TRs) <= end_TR)[0][-1]
+    chosen_TRs = TRs[start_TR_idx:end_TR_idx]  # slice end is exclusive
+
+    for dFC in dFC_lst:
+        print(dFC.measure.measure_name)
+        dFC.visualize_dFC(TRs=chosen_TRs, normalize=False, rank_norm=True, fix_lim=False)
+
+def plot_ML_results(ML_root, output_root, task, run=None, session=None):
+    if session is None:
+        ML_scores = np.load(
+            f"{ML_root}/ML_scores_classify.npy", allow_pickle="TRUE"
+        ).item()
+    else:
+        ML_scores = np.load(
+            f"{ML_root}/{session}/ML_scores_classify.npy", allow_pickle="TRUE"
+        ).item()
+
+    sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0})
+
+    sns.set_style("darkgrid")
+
+    dataframe = pd.DataFrame(ML_scores)
+    if run is not None:
+        dataframe = dataframe[dataframe["run"] == run]
+
+    g = sns.pointplot(
+        data=dataframe[dataframe["task"] == task],
+        x="dFC method",
+        y="KNN accuracy",
+        hue="group",
+        errorbar="sd",
+        linestyle="none",
+        dodge=True,
+        capsize=0.1,
+    )
+    g.axhline(0.5, color="r", linestyle="--")
+    g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"})
+
+    # save the figure
+    if session is None:
+        output_dir = f"{output_root}/group_results/classification"
+    else:
+        output_dir = f"{output_root}/group_results/classification/{session}"
+
+    if run is None:
+        f.savefig(f"{output_dir}/ML_results_classify.png", bbox_inches="tight")
+    else:
+        f.savefig(f"{output_dir}/ML_results_classify_{run}.png", bbox_inches="tight")
+
+
+def plot_task_presence_characteristics():
+    pass
+
+
+def plot_clustering_results():
+    pass
+
+
+# def plot_dFC_clustering(
+#     dFC_root,
+#     subj,
+#     task,
+#     start_time,
+#     end_time,
+#     run=None,
+#     session=None,
+#     normalize_dFC=True,
+# ):
+#     task_data = load_task_data(roi_root, subj, task, run, session)
+#     TR_mri = task_data['TR_mri']
+
+#     dFC_lst = list()
+#     for dFC_id in range(0, 20): # change this to the number of dFCs you have
+#         try:
+#             dFC = load_dFC(dFC_root, subj, task, dFC_id, run, session)
+#             dFC_lst.append(dFC)
+#         except Exception:
+#             pass
+
+#     for dFC in dFC_lst:
+#         dFC_mat = dFC.get_dFC_mat()
+#         TR_array = dFC.TR_array
+#         if normalize_dFC:
+#             dFC_mat = rank_norm(dFC_mat)
+#         dFC_vecs = dFC_mat2vec(dFC_mat)
+
+#         # apply kmeans clustering with PCA to dFC vectors
+#         n_clusters = 2
+
+#         scaler = StandardScaler()
+#         dFC_vecs = scaler.fit_transform(dFC_vecs)
+#         # PCA
+#         # find number of components that explain 95% of variance
+#         pca = PCA()
+#         pca.fit(dFC_vecs)
+#         n_components = np.where(np.cumsum(pca.explained_variance_ratio_) > 0.95)[0][0] + 1
+#         # print(f"Number of components: {n_components}")
+#         pca = PCA(n_components=n_components)
+#         pca.fit(dFC_vecs)
+
+
+#         dFC_vecs_pca = pca.transform(dFC_vecs)
+#         kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=20)
+#         labels_pred = kmeans.fit_predict(dFC_vecs_pca)
+
+#         start_TR = int(start_time/TR_mri)
+#         end_TR = int(end_time/TR_mri)
+#         start_TR_idx = np.where(np.array(TR_array) >= start_TR)[0][0]
+#         end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1]
+
+#         # plot labels_pred
+#         plt.figure(figsize=(35, 2))
+#         plt.plot(time[start_TR:end_TR], labels_pred[start_TR_idx:end_TR_idx], linewidth=4)
+#         # put vertical lines at the start of each TR
+#         for TR in chosen_TRs:
+#             plt.axvline(x=TR*TR_mri, color='r', linestyle='--')
+#             # plt.text(TR*TR_mri, 0.5, f"TR {TR}", fontsize=8, color='black', ha='center')
+#         plt.title(f"Cluster labels of {dFC.measure.measure_name}")
+#         plt.xlabel('Time (s)')
+#         plt.show()
+
+#######################################################################################
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to generate a report of subject results.
+    """
+
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
+
+    args = parser.parse_args()
+
+    dataset_info_file = args.dataset_info
+
+    # Read global configs
+    with open(dataset_info_file, "r") as f:
+        dataset_info = json.load(f)
+
+    TASKS = dataset_info["TASKS"]
+    if "RUNS" in dataset_info:
+        RUNS = dataset_info["RUNS"]
+    else:
+        RUNS = None
+    if RUNS is None:
+        RUNS = {task: [None] for task in TASKS}
+
+    if "SESSIONS" in dataset_info:
+        SESSIONS = dataset_info["SESSIONS"]
+    else:
+        SESSIONS = None
+    if SESSIONS is None:
+        SESSIONS = [None]
+
+    if "{dataset}" in dataset_info["main_root"]:
+        main_root = dataset_info["main_root"].replace(
+            "{dataset}", dataset_info["dataset"]
+        )
+    else:
+        main_root = dataset_info["main_root"]
+
+    if "{main_root}" in dataset_info["roi_root"]:
+        roi_root = dataset_info["roi_root"].replace("{main_root}", main_root)
+    else:
+        roi_root = dataset_info["roi_root"]
+
+    if "{main_root}" in dataset_info["dFC_root"]:
+        dFC_root = dataset_info["dFC_root"].replace("{main_root}", main_root)
+    else:
+        dFC_root = dataset_info["dFC_root"]
+
+    if "{main_root}" in dataset_info["ML_root"]:
+        ML_root = dataset_info["ML_root"].replace("{main_root}", main_root)
+    else:
+        ML_root = dataset_info["ML_root"]
+
+    if "{main_root}" in dataset_info["reports_root"]:
+        figures_root = dataset_info["reports_root"].replace("{main_root}", main_root)
+    else:
+        figures_root = dataset_info["reports_root"]
+
+    print("Generating report...")
+
+    for session in SESSIONS:
+        for task in TASKS:
+            for run in RUNS[task]:
+                plot_ML_results(
+                    ML_root=ML_root,
+                    output_root=figures_root,
+                    task=task,
+                    run=run,
+                    session=session,
+                )
+
+    print("Report generated successfully!")
+
+#######################################################################################
diff --git a/task_dFC/run_scripts/dataset_info.json b/task_dFC/run_scripts/dataset_info.json
index adfa42a..8296d5b 100644
--- a/task_dFC/run_scripts/dataset_info.json
+++ b/task_dFC/run_scripts/dataset_info.json
@@ -6,6 +6,7 @@
 	"fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES",
 	"dFC_root" : "{main_root}/derivatives/dFC_assessed",
 	"ML_root" : "{main_root}/derivatives/ML",
+	"reports_root" : "{main_root}/derivatives/reports",
 	"bold_suffix" : "_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz",
 	"SESSIONS" : [
 		"ses-1"
diff --git a/task_dFC/run_scripts/run_report.sh b/task_dFC/run_scripts/run_report.sh
new file mode 100644
index 0000000..0b71969
--- /dev/null
+++ b/task_dFC/run_scripts/run_report.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# SGE job: runs generate_report.py in the pydfc conda env; bash is needed for `source`.
+#$ -cwd
+#$ -o logs/report_out.txt
+#$ -e logs/report_err.txt
+#$ -l h_vmem=16G
+#$ -q origami.q
+
+DATASET_INFO="./dataset_info.json"
+
+source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
+conda activate pydfc
+python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/generate_report.py" \
+--dataset_info "$DATASET_INFO"
+
+conda deactivate

From e919d2d88f9c3aac7f58560d646b88ccefbca127 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Jun 2024 17:31:53 -0400
Subject: [PATCH 045/401] minor fix

---
 task_dFC/generate_report.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 05169e4..5aef3ed 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -321,6 +321,7 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None):
     else:
         output_dir = f"{output_root}/group_results/classification/{session}"
 
+    f = g.get_figure()
     if run is None:
         f.savefig(f"{output_dir}/ML_results_classify.png", bbox_inches="tight")
     else:

From 811fe67caa582f68e040f31688355439521a6566 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Jun 2024 17:38:41 -0400
Subject: [PATCH 046/401] minor fix

---
 task_dFC/generate_report.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 5aef3ed..c377427 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -321,6 +321,9 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None):
     else:
         output_dir = f"{output_root}/group_results/classification/{session}"
 
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
     f = g.get_figure()
     if run is None:
         f.savefig(f"{output_dir}/ML_results_classify.png", bbox_inches="tight")

From fc17618447a0ab9cbd46eb185530c83bc98a3027 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Jun 2024 17:48:35 -0400
Subject: [PATCH 047/401] minor fix

---
 task_dFC/generate_report.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index c377427..7f7f8f2 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -326,9 +326,11 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None):
 
     f = g.get_figure()
     if run is None:
-        f.savefig(f"{output_dir}/ML_results_classify.png", bbox_inches="tight")
+        f.savefig(f"{output_dir}/ML_results_classify_{task}.png", bbox_inches="tight")
     else:
-        f.savefig(f"{output_dir}/ML_results_classify_{run}.png", bbox_inches="tight")
+        f.savefig(
+            f"{output_dir}/ML_results_classify_{task}_{run}.png", bbox_inches="tight"
+        )
 
 
 def plot_task_presence_characteristics():

From 5c5d2b0b7b72c5a83fcf294bb1d14047f3fd8808 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Jun 2024 18:29:25 -0400
Subject: [PATCH 048/401] minor fix in saving figures

---
 task_dFC/generate_report.py | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 7f7f8f2..c3e6484 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -14,6 +14,14 @@
 from pydfc import DFC, data_loader, task_utils
 from pydfc.dfc_utils import TR_intersection, dFC_mat2vec, dFC_vec2mat, rank_norm
 
+################################# Parameters ####################################
+
+fig_dpi = 120
+fig_bbox_inches = "tight"
+fig_pad = 0.1
+show_title = True
+save_fig_format = "png"  # pdf, png,
+
 #######################################################################################
 
 
@@ -302,6 +310,7 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None):
     if run is not None:
         dataframe = dataframe[dataframe["run"] == run]
 
+    plt.figure(figsize=(10, 5))
     g = sns.pointplot(
         data=dataframe[dataframe["task"] == task],
         x="dFC method",
@@ -313,7 +322,8 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None):
         capsize=0.1,
     )
     g.axhline(0.5, color="r", linestyle="--")
-    g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"})
+    if show_title:
+        g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"})
 
     # save the figure
     if session is None:
@@ -324,14 +334,25 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None):
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
 
-    f = g.get_figure()
     if run is None:
-        f.savefig(f"{output_dir}/ML_results_classify_{task}.png", bbox_inches="tight")
+        plt.savefig(
+            f"{output_dir}/ML_results_classify_{task}.{save_fig_format}",
+            dpi=fig_dpi,
+            bbox_inches=fig_bbox_inches,
+            pad_inches=fig_pad,
+            format=save_fig_format,
+        )
     else:
-        f.savefig(
-            f"{output_dir}/ML_results_classify_{task}_{run}.png", bbox_inches="tight"
+        plt.savefig(
+            f"{output_dir}/ML_results_classify_{task}_{run}.{save_fig_format}",
+            dpi=fig_dpi,
+            bbox_inches=fig_bbox_inches,
+            pad_inches=fig_pad,
+            format=save_fig_format,
         )
 
+    plt.close()
+
 
 def plot_task_presence_characteristics():
     pass

From 1a959e8b4ba80d68820b0b72b575aa5b99753131 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Jun 2024 18:38:32 -0400
Subject: [PATCH 049/401] add logreg to report

---
 task_dFC/generate_report.py | 35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index c3e6484..09896eb 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -292,7 +292,20 @@ def plot_dFC_matrices(
         dFC.visualize_dFC(TRs=chosen_TRs, normalize=False, rank_norm=True, fix_lim=False)
 
 
-def plot_ML_results(ML_root, output_root, task, run=None, session=None):
+def plot_ML_results(
+    ML_root, output_root, task, run=None, session=None, ML_algorithm="KNN"
+):
+    """
+    Plot the ML results for a given task, run and session.
+    parameters:
+    ----------
+        ML_root: str, path to ML results
+        output_root: str, path to save the figures
+        task: str, task name
+        run: int, run number
+        session: str, session name
+        ML_algorithm: str, ML algorithm name (default: KNN, other options: Logistic regression)
+    """
     if session is None:
         ML_scores = np.load(
             f"{ML_root}/ML_scores_classify.npy", allow_pickle="TRUE"
@@ -314,7 +327,7 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None):
     g = sns.pointplot(
         data=dataframe[dataframe["task"] == task],
         x="dFC method",
-        y="KNN accuracy",
+        y=f"{ML_algorithm} accuracy",
         hue="group",
         errorbar="sd",
         linestyle="none",
@@ -334,9 +347,14 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None):
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
 
+    if ML_algorithm == "Logistic regression":
+        ML_algorithm_name = "LogReg"
+    elif ML_algorithm == "KNN":
+        ML_algorithm_name = "KNN"
+
     if run is None:
         plt.savefig(
-            f"{output_dir}/ML_results_classify_{task}.{save_fig_format}",
+            f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}.{save_fig_format}",
             dpi=fig_dpi,
             bbox_inches=fig_bbox_inches,
             pad_inches=fig_pad,
@@ -344,7 +362,7 @@ def plot_ML_results(ML_root, output_root, task, run=None, session=None):
         )
     else:
         plt.savefig(
-            f"{output_dir}/ML_results_classify_{task}_{run}.{save_fig_format}",
+            f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}_{run}.{save_fig_format}",
             dpi=fig_dpi,
             bbox_inches=fig_bbox_inches,
             pad_inches=fig_pad,
@@ -497,6 +515,15 @@ def plot_clustering_results():
                     task=task,
                     run=run,
                     session=session,
+                    ML_algorithm="KNN",
+                )
+                plot_ML_results(
+                    ML_root=ML_root,
+                    output_root=figures_root,
+                    task=task,
+                    run=run,
+                    session=session,
+                    ML_algorithm="Logistic regression",
                 )
 
     print("Report generated successfully!")

From d10787682b391a6ccdd3a29a02a7d66037275998 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Jun 2024 18:40:43 -0400
Subject: [PATCH 050/401] minor fix: disable Logistic regression plot in report

---
 task_dFC/generate_report.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 09896eb..8ce31b9 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -517,14 +517,14 @@ def plot_clustering_results():
                     session=session,
                     ML_algorithm="KNN",
                 )
-                plot_ML_results(
-                    ML_root=ML_root,
-                    output_root=figures_root,
-                    task=task,
-                    run=run,
-                    session=session,
-                    ML_algorithm="Logistic regression",
-                )
+                # plot_ML_results(
+                #     ML_root=ML_root,
+                #     output_root=figures_root,
+                #     task=task,
+                #     run=run,
+                #     session=session,
+                #     ML_algorithm="Logistic regression",
+                # )
 
     print("Report generated successfully!")
 

From e76001a976e5d43b3f46fd847667b0f2c09b784a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Jun 2024 20:41:31 -0400
Subject: [PATCH 051/401] add dFC to reports

---
 task_dFC/generate_report.py        | 50 +++++++++++++++++++++++++-----
 task_dFC/run_scripts/run_report.sh |  4 ++-
 2 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 8ce31b9..3942542 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -256,6 +256,7 @@ def plot_dFC_matrices(
     task,
     start_time,
     end_time,
+    output_root,
     run=None,
     session=None,
 ):
@@ -287,9 +288,23 @@ def plot_dFC_matrices(
     end_TR_idx = np.where(np.array(TRs) <= end_TR)[0][-1]
     chosen_TRs = TRs[start_TR_idx:end_TR_idx]
 
+    output_dir = f"{output_root}/subject_results/{subj}/dFC_matrices"
+    if session is not None:
+        output_dir = f"{output_dir}/{session}"
+    output_dir = f"{output_dir}/{task}"
+    if run is not None:
+        output_dir = f"{output_dir}/{run}"
+    output_dir = f"{output_dir}/"
+
     for dFC in dFC_lst:
-        print(dFC.measure.measure_name)
-        dFC.visualize_dFC(TRs=chosen_TRs, normalize=False, rank_norm=True, fix_lim=False)
+        dFC.visualize_dFC(
+            TRs=chosen_TRs,
+            normalize=False,
+            rank_norm=True,
+            fix_lim=False,
+            save_image=True,
+            output_root=output_dir,
+        )
 
 
 def plot_ML_results(
@@ -453,15 +468,21 @@ def plot_clustering_results():
     parser = argparse.ArgumentParser(description=HELPTEXT)
 
     parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
+    parser.add_argument("--subj_list", type=str, help="path to subject list file")
 
     args = parser.parse_args()
 
     dataset_info_file = args.dataset_info
+    subj_list_file = args.subj_list
 
-    # Read global configs
+    # Read dataset info
     with open(dataset_info_file, "r") as f:
         dataset_info = json.load(f)
 
+    # Read subject list file, a txt file with one subject id per line
+    with open(subj_list_file, "r") as f:
+        SUBJECTS = f.read().splitlines()
+
     TASKS = dataset_info["TASKS"]
     if "RUNS" in dataset_info:
         RUNS = dataset_info["RUNS"]
@@ -500,18 +521,33 @@ def plot_clustering_results():
         ML_root = dataset_info["ML_root"]
 
     if "{main_root}" in dataset_info["reports_root"]:
-        figures_root = dataset_info["reports_root"].replace("{main_root}", main_root)
+        reports_root = dataset_info["reports_root"].replace("{main_root}", main_root)
     else:
-        figures_root = dataset_info["reports_root"]
+        reports_root = dataset_info["reports_root"]
 
     print("Generating report...")
 
+    for subj in SUBJECTS:
+        for session in SESSIONS:
+            for task in TASKS:
+                for run in RUNS[task]:
+                    plot_dFC_matrices(
+                        dFC_root=dFC_root,
+                        subj=subj,
+                        task=task,
+                        start_time=50,
+                        end_time=150,
+                        output_root=reports_root,
+                        run=run,
+                        session=session,
+                    )
+
     for session in SESSIONS:
         for task in TASKS:
             for run in RUNS[task]:
                 plot_ML_results(
                     ML_root=ML_root,
-                    output_root=figures_root,
+                    output_root=reports_root,
                     task=task,
                     run=run,
                     session=session,
@@ -519,7 +555,7 @@ def plot_clustering_results():
                 )
                 # plot_ML_results(
                 #     ML_root=ML_root,
-                #     output_root=figures_root,
+                #     output_root=reports_root,
                 #     task=task,
                 #     run=run,
                 #     session=session,
diff --git a/task_dFC/run_scripts/run_report.sh b/task_dFC/run_scripts/run_report.sh
index 0b71969..1734316 100644
--- a/task_dFC/run_scripts/run_report.sh
+++ b/task_dFC/run_scripts/run_report.sh
@@ -7,10 +7,12 @@
 #$ -q origami.q
 
 DATASET_INFO="./dataset_info.json"
+SUBJ_LIST="./subj_list.txt"
 
 source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
 conda activate pydfc
 python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/generate_report.py" \
---dataset_info $DATASET_INFO
+--dataset_info $DATASET_INFO, \
+--subj_list $SUBJ_LIST
 
 conda deactivate

From 2c2dd957bc76810193359df84783f45499494157 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Jun 2024 20:49:36 -0400
Subject: [PATCH 052/401] minor bug: remove stray comma after $DATASET_INFO in run_report.sh

---
 task_dFC/run_scripts/run_report.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts/run_report.sh b/task_dFC/run_scripts/run_report.sh
index 1734316..2a00cc5 100644
--- a/task_dFC/run_scripts/run_report.sh
+++ b/task_dFC/run_scripts/run_report.sh
@@ -12,7 +12,7 @@ SUBJ_LIST="./subj_list.txt"
 source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
 conda activate pydfc
 python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/generate_report.py" \
---dataset_info $DATASET_INFO, \
+--dataset_info $DATASET_INFO \
 --subj_list $SUBJ_LIST
 
 conda deactivate

From 5a20c2d9e3cec7599955018c77f03720e92d5e7e Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Jun 2024 23:15:21 -0400
Subject: [PATCH 053/401] add visuals to report

---
 task_dFC/generate_report.py | 163 ++++++++++++++++++++++++++++++++----
 1 file changed, 148 insertions(+), 15 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 3942542..0770aef 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -124,6 +124,10 @@ def plot_roi_signals(
     roi_root,
     subj,
     task,
+    start_time,
+    end_time,
+    output_root,
+    nodes_list=range(0, 10),
     session=None,
     run=None,
 ):
@@ -138,6 +142,9 @@ def plot_roi_signals(
         else:
             file_name = "{subj_id}_{session}_{task}_{run}_time-series.npy"
 
+    task_data = load_task_data(roi_root, subj, task, run, session)
+    TR_mri = task_data["TR_mri"]
+
     BOLD = data_loader.load_TS(
         data_root=roi_root,
         file_name=file_name,
@@ -147,13 +154,47 @@ def plot_roi_signals(
         session=session,
     )
 
-    BOLD.visualize(nodes_lst=list(range(0, 10)), save_image=False, output_root=None)
+    time = np.arange(0, BOLD.data.shape[1]) * TR_mri
+    start_time = 200
+    end_time = 300
+    start_TR = int(start_time / TR_mri)
+    end_TR = int(end_time / TR_mri)
+    fig_width = (start_time - end_time) / 5
+    plt.figure(figsize=(fig_width, 3))
+    for i in nodes_list:
+        plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4)
+    if show_title:
+        plt.title("ROI signals")
+    plt.xlabel("Time (s)")
+
+    # save the figure
+    output_dir = f"{output_root}/subject_results/{subj}/ROI_signals"
+    if session is not None:
+        output_dir = f"{output_dir}/{session}"
+    output_dir = f"{output_dir}/{task}"
+    if run is not None:
+        output_dir = f"{output_dir}/{run}"
+    output_dir = f"{output_dir}/"
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    plt.savefig(
+        f"{output_dir}/ROI_signals.{save_fig_format}",
+        dpi=fig_dpi,
+        bbox_inches=fig_bbox_inches,
+        pad_inches=fig_pad,
+        format=save_fig_format,
+    )
+
+    plt.close()
 
 
 def plot_event_labels(
     roi_root,
     subj,
     task,
+    output_root,
     run=None,
     session=None,
 ):
@@ -165,13 +206,35 @@ def plot_event_labels(
     plt.plot(time, task_data["event_labels"], linewidth=4)
     plt.title("Event labels")
     plt.xlabel("Time (s)")
-    plt.show()
+
+    # save the figure
+    output_dir = f"{output_root}/subject_results/{subj}/event_labels"
+    if session is not None:
+        output_dir = f"{output_dir}/{session}"
+    output_dir = f"{output_dir}/{task}"
+    if run is not None:
+        output_dir = f"{output_dir}/{run}"
+    output_dir = f"{output_dir}/"
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    plt.savefig(
+        f"{output_dir}/event_labels.{save_fig_format}",
+        dpi=fig_dpi,
+        bbox_inches=fig_bbox_inches,
+        pad_inches=fig_pad,
+        format=save_fig_format,
+    )
+
+    plt.close()
 
 
 def plot_task_presence(
     roi_root,
     subj,
     task,
+    output_root,
     run=None,
     session=None,
 ):
@@ -204,7 +267,28 @@ def plot_task_presence(
     plt.plot(time, np.mean(task_presence_non_binarized) * np.ones_like(time), linewidth=4)
     plt.title("Task presence")
     plt.xlabel("Time (s)")
-    plt.show()
+
+    # save the figure
+    output_dir = f"{output_root}/subject_results/{subj}/task_presence"
+    if session is not None:
+        output_dir = f"{output_dir}/{session}"
+    output_dir = f"{output_dir}/{task}"
+    if run is not None:
+        output_dir = f"{output_dir}/{run}"
+    output_dir = f"{output_dir}/"
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    plt.savefig(
+        f"{output_dir}/task_presence.{save_fig_format}",
+        dpi=fig_dpi,
+        bbox_inches=fig_bbox_inches,
+        pad_inches=fig_pad,
+        format=save_fig_format,
+    )
+
+    plt.close()
 
 
 def calculate_subj_lvl_task_presence_characteristics(
@@ -246,8 +330,14 @@ def calculate_subj_lvl_task_presence_characteristics(
     print(f"Relative transition frequency: {relative_transition_freq}")
 
 
-def plot_FCS():
-    pass
+# def plot_FCS():
+#     visualize_FCS(
+#         measure,
+#         normalize=True,
+#         fix_lim=False,
+#         save_image=save_image,
+#         output_root=output_root + "FCS/",
+#     )
 
 
 def plot_dFC_matrices(
@@ -531,16 +621,59 @@ def plot_clustering_results():
         for session in SESSIONS:
             for task in TASKS:
                 for run in RUNS[task]:
-                    plot_dFC_matrices(
-                        dFC_root=dFC_root,
-                        subj=subj,
-                        task=task,
-                        start_time=50,
-                        end_time=150,
-                        output_root=reports_root,
-                        run=run,
-                        session=session,
-                    )
+
+                    try:
+                        plot_dFC_matrices(
+                            dFC_root=dFC_root,
+                            subj=subj,
+                            task=task,
+                            start_time=50,
+                            end_time=150,
+                            output_root=reports_root,
+                            run=run,
+                            session=session,
+                        )
+                    except Exception as e:
+                        print(f"Error in plotting dFC matrices: {e}")
+
+                    try:
+                        plot_roi_signals(
+                            roi_root=roi_root,
+                            subj=subj,
+                            task=task,
+                            start_time=50,
+                            end_time=150,
+                            nodes_list=range(0, 10),
+                            output_root=reports_root,
+                            run=run,
+                            session=session,
+                        )
+                    except Exception as e:
+                        print(f"Error in plotting ROI signals: {e}")
+
+                    try:
+                        plot_event_labels(
+                            roi_root=roi_root,
+                            subj=subj,
+                            task=task,
+                            output_root=reports_root,
+                            run=run,
+                            session=session,
+                        )
+                    except Exception as e:
+                        print(f"Error in plotting event labels: {e}")
+
+                    try:
+                        plot_task_presence(
+                            roi_root=roi_root,
+                            subj=subj,
+                            task=task,
+                            output_root=reports_root,
+                            run=run,
+                            session=session,
+                        )
+                    except Exception as e:
+                        print(f"Error in plotting task presence: {e}")
 
     for session in SESSIONS:
         for task in TASKS:

From 243976ad70f084b696b05676b89bdf5ebef69414 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Jun 2024 23:23:52 -0400
Subject: [PATCH 054/401] minor fix: comment out debug shape print in ML.py

---
 task_dFC/ML.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index e5a52d6..e275f7d 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -339,7 +339,7 @@ def dFC_feature_extraction(
                 dFC_measure_name == dFC.measure.measure_name
             ), "dFC measure is not consistent."
 
-    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
+    # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
     subj_label_train = np.array(subj_label_train)
     subj_label_test = np.array(subj_label_test)
 
@@ -743,7 +743,7 @@ def run_clustering(
 
     dataset_info_file = args.dataset_info
 
-    # Read global configs
+    # Read dataset info
     with open(dataset_info_file, "r") as f:
         dataset_info = json.load(f)
 

From 7ca2d4daa0983441a0da4acb0e5826956b993c25 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Jun 2024 23:27:58 -0400
Subject: [PATCH 055/401] test: limit ML to first 20 subjects, raise h_vmem to 64G

---
 task_dFC/ML.py                 | 3 +++
 task_dFC/run_scripts/run_ML.sh | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index e275f7d..c495d2e 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -1,6 +1,7 @@
 import argparse
 import json
 import os
+from re import S
 
 import numpy as np
 from sklearn.cluster import KMeans
@@ -381,6 +382,7 @@ def task_presence_classification(
     SUBJECTS = find_available_subjects(
         dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id
     )
+    SUBJECTS = SUBJECTS[:20]
 
     # randomly select train_test_ratio of the subjects for training
     # and rest for testing using numpy.random.choice
@@ -537,6 +539,7 @@ def task_presence_clustering(
     SUBJECTS = find_available_subjects(
         dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id
     )
+    SUBJECTS = SUBJECTS[:20]
 
     print(f"Number of subjects: {len(SUBJECTS)}")
 
diff --git a/task_dFC/run_scripts/run_ML.sh b/task_dFC/run_scripts/run_ML.sh
index feaf0b0..4ec431a 100644
--- a/task_dFC/run_scripts/run_ML.sh
+++ b/task_dFC/run_scripts/run_ML.sh
@@ -3,7 +3,7 @@
 #$ -cwd
 #$ -o logs/ML_out.txt
 #$ -e logs/ML_err.txt
-#$ -l h_vmem=32G
+#$ -l h_vmem=64G
 #$ -q origami.q
 
 DATASET_INFO="./dataset_info.json"

From c36a03fd94557921f6a2abcc0395b960b0b1c185 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 21 Jun 2024 11:50:10 -0400
Subject: [PATCH 056/401] minor fix: cast n_regions to int in task_presence_clustering

---
 task_dFC/ML.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index c495d2e..7af39ce 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -583,7 +583,7 @@ def task_presence_clustering(
     centroids = kmeans.cluster_centers_
     centroids = pca.inverse_transform(centroids)
     centroids = scaler.inverse_transform(centroids)
-    n_regions = (1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2
+    n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
     centroids_mat = dFC_vec2mat(centroids, n_regions)
 
     clustering_RESULTS = {

From e11a1f06cf7a7b508bca74634c03493458de5fc4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 21 Jun 2024 12:04:17 -0400
Subject: [PATCH 057/401] minor fix: correct fig_width sign, report 5 random subjects

---
 task_dFC/generate_report.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 0770aef..c6827b9 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -159,7 +159,7 @@ def plot_roi_signals(
     end_time = 300
     start_TR = int(start_time / TR_mri)
     end_TR = int(end_time / TR_mri)
-    fig_width = (start_time - end_time) / 5
+    fig_width = (end_time - start_time) / 5
     plt.figure(figsize=(fig_width, 3))
     for i in nodes_list:
         plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4)
@@ -617,6 +617,9 @@ def plot_clustering_results():
 
     print("Generating report...")
 
+    # Generate report only for 5 random subjects
+    SUBJECTS = np.random.choice(SUBJECTS, 5)
+
     for subj in SUBJECTS:
         for session in SESSIONS:
             for task in TASKS:

From fe1184b3159a9b910d90e3eb83aef33d32e9bc65 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 21 Jun 2024 16:01:38 -0400
Subject: [PATCH 058/401] create html report

---
 task_dFC/generate_report.py | 64 +++++++++++++++++++++++++++++++++++--
 1 file changed, 62 insertions(+), 2 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index c6827b9..9cbcb9a 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -548,6 +548,61 @@ def plot_clustering_results():
 #         plt.xlabel('Time (s)')
 #         plt.show()
 
+
+def create_html_report(
+    subj,
+    reports_root,
+):
+    """
+    This function creates an html report for the subject results
+    using the generated figures.
+    """
+    # create html report
+    subj_dir = f"{reports_root}/subject_results/{subj}"
+    file = open(f"{subj_dir}/report.html", "w")
+    file.write("<html>\n")
+    file.write("<head>\n")
+    file.write("<title>Subject results</title>\n")
+    file.write("</head>\n")
+    file.write("<body>\n")
+    file.write("<h1>Subject results</h1>\n")
+    for session in SESSIONS:
+        if session is not None:
+            file.write(f"<h2> {session} </h2>\n")
+        for task in TASKS:
+            file.write(f"<h2> {task} </h2>\n")
+            for run in RUNS[task]:
+                if run is not None:
+                    file.write(f"<h2> {run} </h2>\n")
+                session_task_run_dir = f"{subj_dir}"
+                if session is not None:
+                    session_task_run_dir = f"{session_task_run_dir}/{session}"
+                session_task_run_dir = f"{session_task_run_dir}/{task}"
+                if run is not None:
+                    session_task_run_dir = f"{session_task_run_dir}/{run}"
+
+                file.write(
+                    f"<img src='{subj_dir}/ROI_signals/{session_task_run_dir}/ROI_signals.png' alt='ROI signals'>\n"
+                )
+                file.write(
+                    f"<img src='{subj_dir}/event_labels/{session_task_run_dir}/event_labels.png' alt='Event labels'>\n"
+                )
+                file.write(
+                    f"<img src='{subj_dir}/task_presence/{session_task_run_dir}/task_presence.png' alt='Task presence'>\n"
+                )
+                # for dFC matrices find all png files in the directory
+                dFC_matrices_dir = f"{subj_dir}/dFC_matrices/{session_task_run_dir}"
+                if os.path.exists(dFC_matrices_dir):
+                    for file_name in os.listdir(dFC_matrices_dir):
+                        if file_name.endswith(".png"):
+                            file.write(
+                                f"<img src='{dFC_matrices_dir}/{file_name}' alt='{file_name}'>\n"
+                            )
+    file.write("</body>\n")
+    file.write("</html>\n")
+    file.close()
+
+
 #######################################################################################
 if __name__ == "__main__":
     # argparse
@@ -617,8 +672,8 @@ def plot_clustering_results():
 
     print("Generating report...")
 
-    # Generate report only for 5 random subjects
-    SUBJECTS = np.random.choice(SUBJECTS, 5)
+    # Generate report only one random subjects
+    SUBJECTS = np.random.choice(SUBJECTS, 1)
 
     for subj in SUBJECTS:
         for session in SESSIONS:
@@ -677,6 +732,11 @@ def plot_clustering_results():
                         )
                     except Exception as e:
                         print(f"Error in plotting task presence: {e}")
+        # create html report
+        try:
+            create_html_report(subj, reports_root)
+        except Exception as e:
+            print(f"Error in creating html report: {e}")
 
     for session in SESSIONS:
         for task in TASKS:

From 41978448ceb6a223eba68a6e80136929039eadd4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 21 Jun 2024 18:09:49 -0400
Subject: [PATCH 059/401] minor bug in ML: apply scaler and PCA before kmeans.predict

---
 task_dFC/ML.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 7af39ce..3ff4dec 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -607,9 +607,11 @@ def task_presence_clustering(
         features = X[subj_label == subj, :]
         target = y[subj_label == subj]
 
-        pred_KNN = kmeans.predict(features)
+        features_normalized = scaler.transform(features)
+        features_pca = pca.transform(features_normalized)
+        pred_kmeans = kmeans.predict(features_pca)
 
-        clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_KNN))
+        clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_kmeans))
 
         clustering_scores["task"].append(task)
         clustering_scores["run"].append(run)

From a3dcf8b41d96bb5e151762f5f8971696f5607f93 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 21 Jun 2024 18:26:11 -0400
Subject: [PATCH 060/401] adjust fig size in report

---
 task_dFC/generate_report.py | 91 ++++++++++++++++++++++++++++---------
 1 file changed, 70 insertions(+), 21 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 9cbcb9a..f4d155c 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -155,14 +155,11 @@ def plot_roi_signals(
     )
 
     time = np.arange(0, BOLD.data.shape[1]) * TR_mri
-    start_time = 200
-    end_time = 300
-    start_TR = int(start_time / TR_mri)
-    end_TR = int(end_time / TR_mri)
-    fig_width = (end_time - start_time) / 5
-    plt.figure(figsize=(fig_width, 3))
+    # keep the figure width proportional to the number of time points in data
+    fig_width = int(2.5 * task_data["num_time_mri"])
+    plt.figure(figsize=(fig_width, 5))
     for i in nodes_list:
-        plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4)
+        plt.plot(time, BOLD.data[i, :], linewidth=4)
     if show_title:
         plt.title("ROI signals")
     plt.xlabel("Time (s)")
@@ -202,7 +199,9 @@ def plot_event_labels(
     Fs_task = task_data["Fs_task"]
 
     time = np.arange(0, task_data["event_labels"].shape[0]) / Fs_task
-    plt.figure(figsize=(35, 4))
+    # keep the figure width proportional to the number of time points in data
+    fig_width = int(2.5 * task_data["num_time_mri"])
+    plt.figure(figsize=(fig_width, 5))
     plt.plot(time, task_data["event_labels"], linewidth=4)
     plt.title("Event labels")
     plt.xlabel("Time (s)")
@@ -260,7 +259,9 @@ def plot_task_presence(
     )
 
     time = np.arange(0, task_presence.shape[0]) / Fs_mri
-    plt.figure(figsize=(35, 4))
+    # keep the figure width proportional to the number of time points in data
+    fig_width = int(2.5 * task_data["num_time_mri"])
+    plt.figure(figsize=(fig_width, 5))
     plt.plot(time, task_presence_non_binarized, linewidth=4)
     plt.plot(time, task_presence, linewidth=4)
     # plot mean of task presence_non_binarized as a line
@@ -551,53 +552,95 @@ def plot_clustering_results():
 
 def create_html_report(
     subj,
+    SESSIONS,
+    TASKS,
+    RUNS,
     reports_root,
 ):
     """
     This function creates an html report for the subject results
     using the generated figures.
     """
+    img_height = 150
     # create html report
     subj_dir = f"{reports_root}/subject_results/{subj}"
     file = open(f"{subj_dir}/report.html", "w")
     file.write("<html>\n")
     file.write("<head>\n")
-    file.write("<title>Subject results</title>\n")
+    file.write(f"<title>Subject {subj} Results</title>\n")
     file.write("</head>\n")
     file.write("<body>\n")
-    file.write("<h1>Subject results</h1>\n")
+    file.write(f"<h1>Subject {subj} Results</h1>\n")
     for session in SESSIONS:
         if session is not None:
-            file.write(f"<h2> {session} </h2>\n")
+            file.write(f"<h1> {session} </h1>\n")
         for task in TASKS:
-            file.write(f"<h2> {task} </h2>\n")
+            file.write(f"<h1> {task} </h1>\n")
             for run in RUNS[task]:
                 if run is not None:
                     file.write(f"<h2> {run} </h2>\n")
-                session_task_run_dir = f"{subj_dir}"
                 if session is not None:
-                    session_task_run_dir = f"{session_task_run_dir}/{session}"
-                session_task_run_dir = f"{session_task_run_dir}/{task}"
+                    session_task_run_dir = f"{session}/{task}"
+                else:
+                    session_task_run_dir = f"{task}"
                 if run is not None:
                     session_task_run_dir = f"{session_task_run_dir}/{run}"
 
+                # display ROI signals
+                ROI_signals_img = (
+                    f"{subj_dir}/ROI_signals/{session_task_run_dir}/ROI_signals.png"
+                )
+                img = plt.imread(ROI_signals_img)
+                height, width, _ = img.shape
+                # change the width so that height equals img_height
+                width = int(width * img_height / height)
                 file.write(
-                    f"<img src='{subj_dir}/ROI_signals/{session_task_run_dir}/ROI_signals.png' alt='ROI signals'>\n"
+                    f"<img src='{ROI_signals_img}' alt='ROI signals' width='{width}' height='{img_height}'>\n"
+                )
+                file.write("<br>\n")
+
+                # display event labels
+                event_labels_img = (
+                    f"{subj_dir}/event_labels/{session_task_run_dir}/event_labels.png"
                 )
+                img = plt.imread(event_labels_img)
+                height, width, _ = img.shape
+                # change the width so that height equals img_height
+                width = int(width * img_height / height)
                 file.write(
-                    f"<img src='{subj_dir}/event_labels/{session_task_run_dir}/event_labels.png' alt='Event labels'>\n"
+                    f"<img src='{subj_dir}/event_labels/{session_task_run_dir}/event_labels.png' alt='Event labels' width='{width}' height='{img_height}'>\n"
+                )
+                file.write("<br>\n")
+
+                # display task presence
+                task_presence_img = (
+                    f"{subj_dir}/task_presence/{session_task_run_dir}/task_presence.png"
                 )
+                img = plt.imread(task_presence_img)
+                height, width, _ = img.shape
+                # change the width so that height equals img_height
+                width = int(width * img_height / height)
                 file.write(
-                    f"<img src='{subj_dir}/task_presence/{session_task_run_dir}/task_presence.png' alt='Task presence'>\n"
+                    f"<img src='{subj_dir}/task_presence/{session_task_run_dir}/task_presence.png' alt='Task presence' width='{width}' height='{img_height}'>\n"
                 )
+                file.write("<br>\n")
+
+                # display dFC matrices
                 # for dFC matrices find all png files in the directory
                 dFC_matrices_dir = f"{subj_dir}/dFC_matrices/{session_task_run_dir}"
                 if os.path.exists(dFC_matrices_dir):
                     for file_name in os.listdir(dFC_matrices_dir):
                         if file_name.endswith(".png"):
+                            file.write(f"<h3>{file_name[:file_name.find('_dFC')]}</h3>\n")
+                            # get the original size of the image
+                            img = plt.imread(f"{dFC_matrices_dir}/{file_name}")
+                            height, width, _ = img.shape
+                            # change the width so that height equals img_height
+                            width = int(width * img_height / height)
                             file.write(
-                                f"<img src='{dFC_matrices_dir}/{file_name}' alt='{file_name}'>\n"
+                                f"<img src='{dFC_matrices_dir}/{file_name}' alt='{file_name}' width='{width}' height='{img_height}'>\n"
                             )
+                            file.write("<br>\n")
     file.write("</body>\n")
     file.write("</html>\n")
     file.close()
@@ -734,7 +777,13 @@ def create_html_report(
                         print(f"Error in plotting task presence: {e}")
         # create html report
         try:
-            create_html_report(subj, reports_root)
+            create_html_report(
+                subj=subj,
+                SESSIONS=SESSIONS,
+                TASKS=TASKS,
+                RUNS=RUNS,
+                reports_root=reports_root,
+            )
         except Exception as e:
             print(f"Error in creating html report: {e}")
 

From b34867bbd5d48ccd8a1ebb8cc8d789d98f8a09bf Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 21 Jun 2024 19:55:17 -0400
Subject: [PATCH 061/401] set time interval for report

---
 task_dFC/generate_report.py | 70 +++++++++++++++++++++++++++++--------
 1 file changed, 56 insertions(+), 14 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index f4d155c..77e316a 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -1,6 +1,7 @@
 import argparse
 import json
 import os
+from tracemalloc import start
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -155,11 +156,19 @@ def plot_roi_signals(
     )
 
     time = np.arange(0, BOLD.data.shape[1]) * TR_mri
-    # keep the figure width proportional to the number of time points in data
-    fig_width = int(2.5 * task_data["num_time_mri"])
+    start_TR = int(start_time / TR_mri)
+    end_TR = int(end_time / TR_mri)
+    # keep the figure width proportional to the number of time points
+    fig_width = int(2.5 * (end_time - start_time) / TR_mri)
     plt.figure(figsize=(fig_width, 5))
     for i in nodes_list:
-        plt.plot(time, BOLD.data[i, :], linewidth=4)
+        plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4)
+    # put vertical lines at the start of each TR
+    for TR in range(start_TR, end_TR):
+        plt.axvline(x=TR * TR_mri, color="r", linestyle="--")
+    # show TR labels on the red lines with a small font and at the top
+    for TR in range(start_TR, end_TR):
+        plt.text(TR * TR_mri, 1.2, f"TR {TR}", fontsize=8, color="black", ha="center")
     if show_title:
         plt.title("ROI signals")
     plt.xlabel("Time (s)")
@@ -191,18 +200,28 @@ def plot_event_labels(
     roi_root,
     subj,
     task,
+    start_time,
+    end_time,
     output_root,
     run=None,
     session=None,
 ):
     task_data = load_task_data(roi_root, subj, task, run, session)
     Fs_task = task_data["Fs_task"]
+    TR_task = 1 / Fs_task
+    TR_mri = task_data["TR_mri"]
 
     time = np.arange(0, task_data["event_labels"].shape[0]) / Fs_task
-    # keep the figure width proportional to the number of time points in data
-    fig_width = int(2.5 * task_data["num_time_mri"])
+    start_timepoint = int(start_time / TR_task)
+    end_timepoint = int(end_time / TR_task)
+    # keep the figure width proportional to the number of time points
+    fig_width = int(2.5 * (end_time - start_time) / TR_mri)
     plt.figure(figsize=(fig_width, 5))
-    plt.plot(time, task_data["event_labels"], linewidth=4)
+    plt.plot(
+        time[start_timepoint:end_timepoint],
+        task_data["event_labels"][start_timepoint:end_timepoint],
+        linewidth=4,
+    )
     plt.title("Event labels")
     plt.xlabel("Time (s)")
 
@@ -233,6 +252,8 @@ def plot_task_presence(
     roi_root,
     subj,
     task,
+    start_time,
+    end_time,
     output_root,
     run=None,
     session=None,
@@ -259,13 +280,27 @@ def plot_task_presence(
     )
 
     time = np.arange(0, task_presence.shape[0]) / Fs_mri
+    start_TR = int(start_time / TR_mri)
+    end_TR = int(end_time / TR_mri)
     # keep the figure width proportional to the number of time points in data
-    fig_width = int(2.5 * task_data["num_time_mri"])
+    fig_width = int(2.5 * (end_time - start_time) / TR_mri)
     plt.figure(figsize=(fig_width, 5))
-    plt.plot(time, task_presence_non_binarized, linewidth=4)
-    plt.plot(time, task_presence, linewidth=4)
+    plt.plot(
+        time[start_TR:end_TR], task_presence_non_binarized[start_TR:end_TR], linewidth=4
+    )
+    plt.plot(time[start_TR:end_TR], task_presence[start_TR:end_TR], linewidth=4)
     # plot mean of task presence_non_binarized as a line
-    plt.plot(time, np.mean(task_presence_non_binarized) * np.ones_like(time), linewidth=4)
+    plt.plot(
+        time[start_TR:end_TR],
+        np.mean(task_presence_non_binarized) * np.ones_like(time[start_TR:end_TR]),
+        linewidth=4,
+    )
+    # put vertical lines at the start of each TR
+    for TR in range(start_TR, end_TR):
+        plt.axvline(x=TR * TR_mri, color="r", linestyle="--")
+    # show TR labels on the red lines with a small font and at the top
+    for TR in range(start_TR, end_TR):
+        plt.text(TR * TR_mri, 1.2, f"TR {TR}", fontsize=8, color="black", ha="center")
     plt.title("Task presence")
     plt.xlabel("Time (s)")
 
@@ -718,6 +753,9 @@ def create_html_report(
     # Generate report only one random subjects
     SUBJECTS = np.random.choice(SUBJECTS, 1)
 
+    start_time = 0
+    end_time = 200
+
     for subj in SUBJECTS:
         for session in SESSIONS:
             for task in TASKS:
@@ -728,8 +766,8 @@ def create_html_report(
                             dFC_root=dFC_root,
                             subj=subj,
                             task=task,
-                            start_time=50,
-                            end_time=150,
+                            start_time=start_time,
+                            end_time=end_time,
                             output_root=reports_root,
                             run=run,
                             session=session,
@@ -742,8 +780,8 @@ def create_html_report(
                             roi_root=roi_root,
                             subj=subj,
                             task=task,
-                            start_time=50,
-                            end_time=150,
+                            start_time=start_time,
+                            end_time=end_time,
                             nodes_list=range(0, 10),
                             output_root=reports_root,
                             run=run,
@@ -757,6 +795,8 @@ def create_html_report(
                             roi_root=roi_root,
                             subj=subj,
                             task=task,
+                            start_time=start_time,
+                            end_time=end_time,
                             output_root=reports_root,
                             run=run,
                             session=session,
@@ -769,6 +809,8 @@ def create_html_report(
                             roi_root=roi_root,
                             subj=subj,
                             task=task,
+                            start_time=start_time,
+                            end_time=end_time,
                             output_root=reports_root,
                             run=run,
                             session=session,

From 71b96b0a30889a7f91aba34047825c73d78e1f6f Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 21 Jun 2024 21:35:14 -0400
Subject: [PATCH 062/401] change fig size report

---
 task_dFC/generate_report.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 77e316a..0e46f56 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -1,7 +1,6 @@
 import argparse
 import json
 import os
-from tracemalloc import start
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -159,7 +158,7 @@ def plot_roi_signals(
     start_TR = int(start_time / TR_mri)
     end_TR = int(end_time / TR_mri)
     # keep the figure width proportional to the number of time points
-    fig_width = int(2.5 * (end_time - start_time) / TR_mri)
+    fig_width = int(5 * (end_time - start_time) / TR_mri)
     plt.figure(figsize=(fig_width, 5))
     for i in nodes_list:
         plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4)
@@ -215,7 +214,7 @@ def plot_event_labels(
     start_timepoint = int(start_time / TR_task)
     end_timepoint = int(end_time / TR_task)
     # keep the figure width proportional to the number of time points
-    fig_width = int(2.5 * (end_time - start_time) / TR_mri)
+    fig_width = int(5 * (end_time - start_time) / TR_mri)
     plt.figure(figsize=(fig_width, 5))
     plt.plot(
         time[start_timepoint:end_timepoint],
@@ -283,7 +282,7 @@ def plot_task_presence(
     start_TR = int(start_time / TR_mri)
     end_TR = int(end_time / TR_mri)
     # keep the figure width proportional to the number of time points in data
-    fig_width = int(2.5 * (end_time - start_time) / TR_mri)
+    fig_width = int(5 * (end_time - start_time) / TR_mri)
     plt.figure(figsize=(fig_width, 5))
     plt.plot(
         time[start_TR:end_TR], task_presence_non_binarized[start_TR:end_TR], linewidth=4
@@ -750,8 +749,9 @@ def create_html_report(
 
     print("Generating report...")
 
-    # Generate report only one random subjects
-    SUBJECTS = np.random.choice(SUBJECTS, 1)
+    # Generate report only 5 random subjects
+    # SUBJECTS = np.random.choice(SUBJECTS, 5)
+    SUBJECTS = SUBJECTS[:1]
 
     start_time = 0
     end_time = 200

From e8997b7ca275cc08354f706d7becfd3b86f3c607 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 21 Jun 2024 22:34:48 -0400
Subject: [PATCH 063/401] minor change

---
 task_dFC/generate_report.py              | 11 +++++++----
 task_dFC/run_scripts/run_dFC.sh          |  2 +-
 task_dFC/run_scripts/run_fmriprep.sh     |  2 +-
 task_dFC/run_scripts/run_nifti_to_roi.sh |  2 +-
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 0e46f56..3078f0f 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -158,7 +158,7 @@ def plot_roi_signals(
     start_TR = int(start_time / TR_mri)
     end_TR = int(end_time / TR_mri)
     # keep the figure width proportional to the number of time points
-    fig_width = int(5 * (end_time - start_time) / TR_mri)
+    fig_width = int(2.5 * (end_time - start_time) / TR_mri)
     plt.figure(figsize=(fig_width, 5))
     for i in nodes_list:
         plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4)
@@ -214,7 +214,7 @@ def plot_event_labels(
     start_timepoint = int(start_time / TR_task)
     end_timepoint = int(end_time / TR_task)
     # keep the figure width proportional to the number of time points
-    fig_width = int(5 * (end_time - start_time) / TR_mri)
+    fig_width = int(2.5 * (end_time - start_time) / TR_mri)
     plt.figure(figsize=(fig_width, 5))
     plt.plot(
         time[start_timepoint:end_timepoint],
@@ -282,7 +282,7 @@ def plot_task_presence(
     start_TR = int(start_time / TR_mri)
     end_TR = int(end_time / TR_mri)
     # keep the figure width proportional to the number of time points in data
-    fig_width = int(5 * (end_time - start_time) / TR_mri)
+    fig_width = int(2.5 * (end_time - start_time) / TR_mri)
     plt.figure(figsize=(fig_width, 5))
     plt.plot(
         time[start_TR:end_TR], task_presence_non_binarized[start_TR:end_TR], linewidth=4
@@ -595,7 +595,7 @@ def create_html_report(
     This function creates an html report for the subject results
     using the generated figures.
     """
-    img_height = 150
+
     # create html report
     subj_dir = f"{reports_root}/subject_results/{subj}"
     file = open(f"{subj_dir}/report.html", "w")
@@ -620,6 +620,8 @@ def create_html_report(
                 if run is not None:
                     session_task_run_dir = f"{session_task_run_dir}/{run}"
 
+                img_height = 100
+
                 # display ROI signals
                 ROI_signals_img = (
                     f"{subj_dir}/ROI_signals/{session_task_run_dir}/ROI_signals.png"
@@ -660,6 +662,7 @@ def create_html_report(
                 file.write("<br>\n")
 
                 # display dFC matrices
+                img_height = 50
                 # for dFC matrices find all png files in the directory
                 dFC_matrices_dir = f"{subj_dir}/dFC_matrices/{session_task_run_dir}"
                 if os.path.exists(dFC_matrices_dir):
diff --git a/task_dFC/run_scripts/run_dFC.sh b/task_dFC/run_scripts/run_dFC.sh
index 0683935..684dbea 100644
--- a/task_dFC/run_scripts/run_dFC.sh
+++ b/task_dFC/run_scripts/run_dFC.sh
@@ -5,7 +5,7 @@
 #$ -e logs/dfc_err.txt
 #$ -l h_vmem=32G
 #$ -q origami.q
-#$ -t 1-200
+#$ -t 1-300
 
 SUBJECT_LIST="./subj_list.txt"
 DATASET_INFO="./dataset_info.json"
diff --git a/task_dFC/run_scripts/run_fmriprep.sh b/task_dFC/run_scripts/run_fmriprep.sh
index 53dc89d..ea3c357 100644
--- a/task_dFC/run_scripts/run_fmriprep.sh
+++ b/task_dFC/run_scripts/run_fmriprep.sh
@@ -7,7 +7,7 @@
 #$ -l h_vmem=32G
 #$ -q origami.q
 
-#$ -t 1-122
+#$ -t 1-300
 
 # TODO replace with local paths
 source "/data/origami/dFC/anaconda3/etc/profile.d/conda.sh"
diff --git a/task_dFC/run_scripts/run_nifti_to_roi.sh b/task_dFC/run_scripts/run_nifti_to_roi.sh
index 5f10f08..9af79f7 100644
--- a/task_dFC/run_scripts/run_nifti_to_roi.sh
+++ b/task_dFC/run_scripts/run_nifti_to_roi.sh
@@ -5,7 +5,7 @@
 #$ -e logs/roi_err.txt
 #$ -l h_vmem=32G
 #$ -q origami.q
-#$ -t 1-200
+#$ -t 1-300
 
 SUBJECT_LIST="./subj_list.txt"
 DATASET_INFO="./dataset_info.json"

From 067331656598acac569139d1a1c9f3dfb5e8d4df Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 21 Jun 2024 23:28:17 -0400
Subject: [PATCH 064/401] fix path in report

---
 task_dFC/generate_report.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 3078f0f..c53b4bd 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -630,6 +630,8 @@ def create_html_report(
                 height, width, _ = img.shape
                 # change the width so that height equals img_height
                 width = int(width * img_height / height)
+                # replace the path to the image with a relative path
+                ROI_signals_img = ROI_signals_img.replace(subj_dir, ".")
                 file.write(
                     f"<img src='{ROI_signals_img}' alt='ROI signals' width='{width}' height='{img_height}'>\n"
                 )
@@ -643,8 +645,10 @@ def create_html_report(
                 height, width, _ = img.shape
                 # change the width so that height equals img_height
                 width = int(width * img_height / height)
+                # replace the path to the image with a relative path
+                event_labels_img = event_labels_img.replace(subj_dir, ".")
                 file.write(
-                    f"<img src='{subj_dir}/event_labels/{session_task_run_dir}/event_labels.png' alt='Event labels' width='{width}' height='{img_height}'>\n"
+                    f"<img src='{event_labels_img}' alt='Event labels' width='{width}' height='{img_height}'>\n"
                 )
                 file.write("<br>\n")
 
@@ -656,26 +660,31 @@ def create_html_report(
                 height, width, _ = img.shape
                 # change the width so that height equals img_height
                 width = int(width * img_height / height)
+                # replace the path to the image with a relative path
+                task_presence_img = task_presence_img.replace(subj_dir, ".")
                 file.write(
-                    f"<img src='{subj_dir}/task_presence/{session_task_run_dir}/task_presence.png' alt='Task presence' width='{width}' height='{img_height}'>\n"
+                    f"<img src='{task_presence_img}' alt='Task presence' width='{width}' height='{img_height}'>\n"
                 )
                 file.write("<br>\n")
 
                 # display dFC matrices
-                img_height = 50
+                img_height = 45
                 # for dFC matrices find all png files in the directory
                 dFC_matrices_dir = f"{subj_dir}/dFC_matrices/{session_task_run_dir}"
                 if os.path.exists(dFC_matrices_dir):
                     for file_name in os.listdir(dFC_matrices_dir):
                         if file_name.endswith(".png"):
                             file.write(f"<h3>{file_name[:file_name.find('_dFC')]}</h3>\n")
+                            dFC_matrices_img = f"{dFC_matrices_dir}/{file_name}"
                             # get the original size of the image
-                            img = plt.imread(f"{dFC_matrices_dir}/{file_name}")
+                            img = plt.imread(dFC_matrices_img)
                             height, width, _ = img.shape
                             # change the width so that height equals img_height
                             width = int(width * img_height / height)
+                            # replace the path to the image with a relative path
+                            dFC_matrices_img = dFC_matrices_img.replace(subj_dir, ".")
                             file.write(
-                                f"<img src='{dFC_matrices_dir}/{file_name}' alt='{file_name}' width='{width}' height='{img_height}'>\n"
+                                f"<img src='{dFC_matrices_img}' alt='{file_name}' width='{width}' height='{img_height}'>\n"
                             )
                             file.write("<br>\n")
     file.write("</body>\n")

From ec9966bf3b5ae659b9af096760c68f7bcdc93ba8 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 22 Jun 2024 00:40:02 -0400
Subject: [PATCH 065/401] add task features to report

---
 task_dFC/generate_report.py | 240 +++++++++++++++++++++++++++++++++---
 1 file changed, 225 insertions(+), 15 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index c53b4bd..ecd6fa3 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -464,11 +464,13 @@ def plot_ML_results(
         dataframe = dataframe[dataframe["run"] == run]
 
     plt.figure(figsize=(10, 5))
+
     g = sns.pointplot(
         data=dataframe[dataframe["task"] == task],
         x="dFC method",
         y=f"{ML_algorithm} accuracy",
         hue="group",
+        hue_order=["train", "test"],
         errorbar="sd",
         linestyle="none",
         dodge=True,
@@ -512,10 +514,6 @@ def plot_ML_results(
     plt.close()
 
 
-def plot_task_presence_characteristics():
-    pass
-
-
 def plot_clustering_results():
     pass
 
@@ -584,7 +582,76 @@ def plot_clustering_results():
 #         plt.show()
 
 
-def create_html_report(
+def plot_task_presence_features(
+    ML_root,
+    output_root,
+    session=None,
+    run=None,
+):
+    """
+    Plot the task presence features for a given session and run.
+    for comparability of tasks, pass the same run number for all tasks
+    parameters:
+    ----------
+        ML_root: str, path to ML results
+        output_root: str, path to save the figures
+        session: str, session name
+        run: int, run number
+    """
+    if session is None:
+        task_features = np.load(
+            f"{ML_root}/task_features.npy", allow_pickle="TRUE"
+        ).item()
+    else:
+        task_features = np.load(
+            f"{ML_root}/{session}/task_features.npy", allow_pickle="TRUE"
+        ).item()
+
+    sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0})
+
+    sns.set_style("darkgrid")
+
+    dataframe = pd.DataFrame(task_features)
+    if run is not None:
+        dataframe = dataframe[dataframe["run"] == run]
+
+    # FEATURES are columns in the dataframe except for 'task' and 'run'
+    FEATURES = list(dataframe.columns)
+    FEATURES.remove("task")
+    FEATURES.remove("run")
+
+    for i, feature in enumerate(FEATURES):
+        plt.figure(figsize=(10, 5))
+        sns.pointplot(
+            data=dataframe,
+            x="task",
+            y=feature,
+            errorbar="sd",
+            linestyle="none",
+            dodge=True,
+            capsize=0.1,
+        )
+        # save the figure
+        if session is None:
+            output_dir = f"{output_root}/group_results/task_presence_features"
+        else:
+            output_dir = f"{output_root}/group_results/task_presence_features/{session}"
+
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
+        plt.savefig(
+            f"{output_dir}/task_presence_features_{feature}.{save_fig_format}",
+            dpi=fig_dpi,
+            bbox_inches=fig_bbox_inches,
+            pad_inches=fig_pad,
+            format=save_fig_format,
+        )
+
+        plt.close()
+
+
+def create_html_report_subj_results(
     subj,
     SESSIONS,
     TASKS,
@@ -692,6 +759,111 @@ def create_html_report(
     file.close()
 
 
+def create_html_report_group_results(
+    SESSIONS,
+    TASKS,
+    RUNS,
+    reports_root,
+):
+    """
+    This function creates an html report for the group results
+    using the generated figures.
+    """
+    # create html report
+    group_dir = f"{reports_root}/group_results"
+    file = open(f"{group_dir}/report.html", "w")
+    file.write("<html>\n")
+    file.write("<head>\n")
+    file.write("<title>Group Results</title>\n")
+    file.write("</head>\n")
+    file.write("<body>\n")
+    file.write("<h1>Group Results</h1>\n")
+
+    # task presence features
+    img_height = 300
+    file.write("<h1>Task presence features</h1>\n")
+    for session in SESSIONS:
+        if session is not None:
+            file.write(f"<h1> {session} </h1>\n")
+        # display task presence features
+        if session is not None:
+            task_presence_features_dir = f"{group_dir}/task_presence_features/{session}"
+        else:
+            task_presence_features_dir = f"{group_dir}/task_presence_features"
+        # find all png files in the directory
+        for file_name in os.listdir(task_presence_features_dir):
+            if file_name.endswith(".png"):
+                file.write(f"<h3>{file_name[:file_name.find('_task')]}</h3>\n")
+                task_presence_features_img = f"{task_presence_features_dir}/{file_name}"
+                # get the original size of the image
+                img = plt.imread(task_presence_features_img)
+                height, width, _ = img.shape
+                # change the width so that height equals img_height
+                width = int(width * img_height / height)
+                # replace the path to the image with a relative path
+                task_presence_features_img = task_presence_features_img.replace(
+                    group_dir, "."
+                )
+                file.write(
+                    f"<img src='{task_presence_features_img}' alt='Task presence features' width='{width}' height='{img_height}'>\n"
+                )
+
+    # classification results
+    img_height = 300
+    file.write("<h1>Classification results</h1>\n")
+    for session in SESSIONS:
+        if session is not None:
+            file.write(f"<h1> {session} </h1>\n")
+        for task in TASKS:
+            file.write(f"<h1> {task} </h1>\n")
+            for run in RUNS[task]:
+                # if run is not None:
+                #     file.write(f"<h2> {run} </h2>\n")
+                if session is not None:
+                    classification_dir = f"{group_dir}/classification/{session}"
+                else:
+                    classification_dir = f"{group_dir}/classification"
+
+                # display KNN classification results
+                if run is None:
+                    classification_img = (
+                        f"{classification_dir}/ML_results_classify_KNN_{task}.png"
+                    )
+                else:
+                    classification_img = (
+                        f"{classification_dir}/ML_results_classify_KNN_{task}_{run}.png"
+                    )
+                img = plt.imread(classification_img)
+                height, width, _ = img.shape
+                # change the width so that height equals img_height
+                width = int(width * img_height / height)
+                # replace the path to the image with a relative path
+                classification_img = classification_img.replace(group_dir, ".")
+                file.write(
+                    f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
+                )
+
+                # # display Logistic regression classification results
+                # if run is None:
+                #     classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}.png"
+                # else:
+                #     classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{run}.png"
+                # img = plt.imread(classification_img)
+                # height, width, _ = img.shape
+                # # change the width so that height equals img_height
+                # width = int(width * img_height / height)
+                # # replace the path to the image with a relative path
+                # classification_img = classification_img.replace(group_dir, ".")
+                # file.write(
+                #     f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
+                # )
+
+            file.write("<br>\n")
+    file.write("</body>\n")
+    file.write("</html>\n")
+    file.close()
+
+
 #######################################################################################
 if __name__ == "__main__":
     # argparse
@@ -831,7 +1003,7 @@ def create_html_report(
                         print(f"Error in plotting task presence: {e}")
         # create html report
         try:
-            create_html_report(
+            create_html_report_subj_results(
                 subj=subj,
                 SESSIONS=SESSIONS,
                 TASKS=TASKS,
@@ -839,19 +1011,46 @@ def create_html_report(
                 reports_root=reports_root,
             )
         except Exception as e:
-            print(f"Error in creating html report: {e}")
+            print(f"Error in creating html report for subject results: {e}")
+
+    # find the common run number for all tasks for task presence features
+    common_run = None
+    for task in TASKS:
+        if common_run is None:
+            common_run = RUNS[task][0]
+        else:
+            if RUNS[task][0] != common_run:
+                common_run = None
+                # raise warning
+                print(
+                    "Warning: Tasks have different run numbers for task presence features!"
+                )
+                break
 
     for session in SESSIONS:
+        try:
+            plot_task_presence_features(
+                ML_root=ML_root,
+                output_root=reports_root,
+                session=session,
+                run=common_run,
+            )
+        except Exception as e:
+            print(f"Error in plotting task presence features: {e}")
+
         for task in TASKS:
             for run in RUNS[task]:
-                plot_ML_results(
-                    ML_root=ML_root,
-                    output_root=reports_root,
-                    task=task,
-                    run=run,
-                    session=session,
-                    ML_algorithm="KNN",
-                )
+                try:
+                    plot_ML_results(
+                        ML_root=ML_root,
+                        output_root=reports_root,
+                        task=task,
+                        run=run,
+                        session=session,
+                        ML_algorithm="KNN",
+                    )
+                except Exception as e:
+                    print(f"Error in plotting ML results for KNN: {e}")
                 # plot_ML_results(
                 #     ML_root=ML_root,
                 #     output_root=reports_root,
@@ -861,6 +1060,17 @@ def create_html_report(
                 #     ML_algorithm="Logistic regression",
                 # )
 
+    # create html report
+    try:
+        create_html_report_group_results(
+            SESSIONS=SESSIONS,
+            TASKS=TASKS,
+            RUNS=RUNS,
+            reports_root=reports_root,
+        )
+    except Exception as e:
+        print(f"Error in creating html report for group results: {e}")
+
     print("Report generated successfully!")
 
 #######################################################################################

From e280de1fa630c7d2bc5bfb3541f691e14f1b8f5c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 24 Jun 2024 12:19:41 -0400
Subject: [PATCH 066/401] minor change

---
 task_dFC/ML.py              | 3 ---
 task_dFC/generate_report.py | 5 ++---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 3ff4dec..b81f3a2 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -1,7 +1,6 @@
 import argparse
 import json
 import os
-from re import S
 
 import numpy as np
 from sklearn.cluster import KMeans
@@ -382,7 +381,6 @@ def task_presence_classification(
     SUBJECTS = find_available_subjects(
         dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id
     )
-    SUBJECTS = SUBJECTS[:20]
 
     # randomly select train_test_ratio of the subjects for training
     # and rest for testing using numpy.random.choice
@@ -539,7 +537,6 @@ def task_presence_clustering(
     SUBJECTS = find_available_subjects(
         dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id
     )
-    SUBJECTS = SUBJECTS[:20]
 
     print(f"Number of subjects: {len(SUBJECTS)}")
 
diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index ecd6fa3..1310745 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -781,7 +781,7 @@ def create_html_report_group_results(
 
     # task presence features
     img_height = 300
-    file.write("<h1>Task presence features</h1>\n")
+    file.write("<h1>Task Presence Features</h1>\n")
     for session in SESSIONS:
         if session is not None:
             file.write(f"<h1> {session} </h1>\n")
@@ -793,7 +793,6 @@ def create_html_report_group_results(
         # find all png files in the directory
         for file_name in os.listdir(task_presence_features_dir):
             if file_name.endswith(".png"):
-                file.write(f"<h3>{file_name[:file_name.find('_task')]}</h3>\n")
                 task_presence_features_img = f"{task_presence_features_dir}/{file_name}"
                 # get the original size of the image
                 img = plt.imread(task_presence_features_img)
@@ -810,7 +809,7 @@ def create_html_report_group_results(
 
     # classification results
     img_height = 300
-    file.write("<h1>Classification results</h1>\n")
+    file.write("<h1>Classification Results</h1>\n")
     for session in SESSIONS:
         if session is not None:
             file.write(f"<h1> {session} </h1>\n")

From dcbce16c9573a2012c565c351f495c07c195a932 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 24 Jun 2024 16:12:44 -0400
Subject: [PATCH 067/401] remove -t from run scripts

---
 task_dFC/run_scripts/run_FCS.sh          | 1 -
 task_dFC/run_scripts/run_dFC.sh          | 1 -
 task_dFC/run_scripts/run_fmriprep.sh     | 2 --
 task_dFC/run_scripts/run_nifti_to_roi.sh | 1 -
 4 files changed, 5 deletions(-)

diff --git a/task_dFC/run_scripts/run_FCS.sh b/task_dFC/run_scripts/run_FCS.sh
index fb22ed5..a84c578 100644
--- a/task_dFC/run_scripts/run_FCS.sh
+++ b/task_dFC/run_scripts/run_FCS.sh
@@ -5,7 +5,6 @@
 #$ -e logs/fcs_err.txt
 #$ -l h_vmem=64G
 #$ -q origami.q
-#$ -t 1-10
 
 DATASET_INFO="./dataset_info.json"
 METHODS_CONFIG="./methods_config.json"
diff --git a/task_dFC/run_scripts/run_dFC.sh b/task_dFC/run_scripts/run_dFC.sh
index 684dbea..124dc1f 100644
--- a/task_dFC/run_scripts/run_dFC.sh
+++ b/task_dFC/run_scripts/run_dFC.sh
@@ -5,7 +5,6 @@
 #$ -e logs/dfc_err.txt
 #$ -l h_vmem=32G
 #$ -q origami.q
-#$ -t 1-300
 
 SUBJECT_LIST="./subj_list.txt"
 DATASET_INFO="./dataset_info.json"
diff --git a/task_dFC/run_scripts/run_fmriprep.sh b/task_dFC/run_scripts/run_fmriprep.sh
index ea3c357..ada2813 100644
--- a/task_dFC/run_scripts/run_fmriprep.sh
+++ b/task_dFC/run_scripts/run_fmriprep.sh
@@ -7,8 +7,6 @@
 #$ -l h_vmem=32G
 #$ -q origami.q
 
-#$ -t 1-300
-
 # TODO replace with local paths
 source "/data/origami/dFC/anaconda3/etc/profile.d/conda.sh"
 conda activate nipoppy_env
diff --git a/task_dFC/run_scripts/run_nifti_to_roi.sh b/task_dFC/run_scripts/run_nifti_to_roi.sh
index 9af79f7..1fff1da 100644
--- a/task_dFC/run_scripts/run_nifti_to_roi.sh
+++ b/task_dFC/run_scripts/run_nifti_to_roi.sh
@@ -5,7 +5,6 @@
 #$ -e logs/roi_err.txt
 #$ -l h_vmem=32G
 #$ -q origami.q
-#$ -t 1-300
 
 SUBJECT_LIST="./subj_list.txt"
 DATASET_INFO="./dataset_info.json"

From b5a3dfbacfb9975cc4555247ae997d582754aed5 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 2 Jul 2024 12:57:57 -0400
Subject: [PATCH 068/401] add dFC clustering to report

---
 task_dFC/generate_report.py | 366 +++++++++++++++++++++++++++---------
 1 file changed, 276 insertions(+), 90 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 1310745..7fd231c 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -514,72 +514,168 @@ def plot_ML_results(
     plt.close()
 
 
-def plot_clustering_results():
-    pass
+def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
+    """
+    Plot the clustering results for a given task, run and session.
+    parameters:
+    ----------
+        ML_root: str, path to ML results
+        output_root: str, path to save the figures
+        task: str, task name
+        run: int, run number
+        session: str, session name
+    """
+    if session is None:
+        clustering_scores = np.load(
+            f"{ML_root}/clustering_scores.npy", allow_pickle="TRUE"
+        ).item()
+    else:
+        clustering_scores = np.load(
+            f"{ML_root}/{session}/clustering_scores.npy", allow_pickle="TRUE"
+        ).item()
 
+    sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0})
 
-# def plot_dFC_clustering(
-#     dFC_root,
-#     subj,
-#     task,
-#     start_time,
-#     end_time,
-#     run=None,
-#     session=None,
-#     normalize_dFC=True,
-# ):
-#     task_data = load_task_data(roi_root, subj, task, run, session)
-#     TR_mri = task_data['TR_mri']
-
-#     dFC_lst = list()
-#     for dFC_id in range(0, 20): # change this to the number of dFCs you have
-#         try:
-#             dFC = load_dFC(dFC_root, subj, task, dFC_id, run, session)
-#             dFC_lst.append(dFC)
-#         except Exception:
-#             pass
-
-#     for dFC in dFC_lst:
-#         dFC_mat = dFC.get_dFC_mat()
-#         TR_array = dFC.TR_array
-#         if normalize_dFC:
-#             dFC_mat = rank_norm(dFC_mat)
-#         dFC_vecs = dFC_mat2vec(dFC_mat)
-
-#         # apply kmeans clustering with PCA to dFC vectors
-#         n_clusters = 2
-
-#         scaler = StandardScaler()
-#         dFC_vecs = scaler.fit_transform(dFC_vecs)
-#         # PCA
-#         # find number of components that explain 95% of variance
-#         pca = PCA()
-#         pca.fit(dFC_vecs)
-#         n_components = np.where(np.cumsum(pca.explained_variance_ratio_) > 0.95)[0][0] + 1
-#         # print(f"Number of components: {n_components}")
-#         pca = PCA(n_components=n_components)
-#         pca.fit(dFC_vecs)
-
-
-#         dFC_vecs_pca = pca.transform(dFC_vecs)
-#         kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=20)
-#         labels_pred = kmeans.fit_predict(dFC_vecs_pca)
-
-#         start_TR = int(start_time/TR_mri)
-#         end_TR = int(end_time/TR_mri)
-#         start_TR_idx = np.where(np.array(TR_array) >= start_TR)[0][0]
-#         end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1]
-
-#         # plot labels_pred
-#         plt.figure(figsize=(35, 2))
-#         plt.plot(time[start_TR:end_TR], labels_pred[start_TR_idx:end_TR_idx], linewidth=4)
-#         # put vertical lines at the start of each TR
-#         for TR in chosen_TRs:
-#             plt.axvline(x=TR*TR_mri, color='r', linestyle='--')
-#             # plt.text(TR*TR_mri, 0.5, f"TR {TR}", fontsize=8, color='black', ha='center')
-#         plt.title(f"Cluster labels of {dFC.measure.measure_name}")
-#         plt.xlabel('Time (s)')
-#         plt.show()
+    sns.set_style("darkgrid")
+
+    dataframe = pd.DataFrame(clustering_scores)
+    if run is not None:
+        dataframe = dataframe[dataframe["run"] == run]
+
+    plt.figure(figsize=(10, 5))
+    g = sns.pointplot(
+        data=dataframe[dataframe["task"] == task],
+        x="dFC method",
+        y="Kmeans ARI",
+        errorbar="sd",
+        linestyle="none",
+        dodge=True,
+        capsize=0.1,
+    )
+    g.axhline(0.0, color="r", linestyle="--")
+    if show_title:
+        g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"})
+
+    # save the figure
+    if session is None:
+        output_dir = f"{output_root}/group_results/clustering"
+    else:
+        output_dir = f"{output_root}/group_results/clustering/{session}"
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    if run is None:
+        plt.savefig(
+            f"{output_dir}/clustering_results_{task}.{save_fig_format}",
+            dpi=fig_dpi,
+            bbox_inches=fig_bbox_inches,
+            pad_inches=fig_pad,
+            format=save_fig_format,
+        )
+    else:
+        plt.savefig(
+            f"{output_dir}/clustering_results_{task}_{run}.{save_fig_format}",
+            dpi=fig_dpi,
+            bbox_inches=fig_bbox_inches,
+            pad_inches=fig_pad,
+            format=save_fig_format,
+        )
+
+    plt.close()
+
+
+def plot_dFC_clustering(
+    dFC_root,
+    subj,
+    task,
+    start_time,
+    end_time,
+    output_root,
+    run=None,
+    session=None,
+    normalize_dFC=True,
+):
+    task_data = load_task_data(roi_root, subj, task, run, session)
+    TR_mri = task_data["TR_mri"]
+
+    for dFC_id in range(
+        0, 20
+    ):  # change this to the number of dFCs you have or write a function that finds available dFC ids
+        try:
+            dFC = load_dFC(dFC_root, subj, task, dFC_id, run, session)
+        except Exception:
+            pass
+
+        dFC_mat = dFC.get_dFC_mat()
+        TR_array = dFC.TR_array
+        if normalize_dFC:
+            dFC_mat = rank_norm(dFC_mat)
+        dFC_vecs = dFC_mat2vec(dFC_mat)
+
+        if session is None:
+            clustering_RESULTS = np.load(
+                f"{ML_root}/clustering_RESULTS_{dFC_id}.npy", allow_pickle="TRUE"
+            ).item()
+        else:
+            clustering_RESULTS = np.load(
+                f"{ML_root}/{session}/clustering_RESULTS_{dFC_id}.npy",
+                allow_pickle="TRUE",
+            ).item()
+
+        if run is None:
+            scaler = clustering_RESULTS[task]["StandardScaler"]
+            pca = clustering_RESULTS[task]["PCA"]
+            kmeans = clustering_RESULTS[task]["kmeans"]
+        else:
+            scaler = clustering_RESULTS[task][run]["StandardScaler"]
+            pca = clustering_RESULTS[task][run]["PCA"]
+            kmeans = clustering_RESULTS[task][run]["kmeans"]
+
+        dFC_vecs_normalized = scaler.transform(dFC_vecs)
+        dFC_vecs_pca = pca.transform(dFC_vecs_normalized)
+        cluster_labels = kmeans.predict(dFC_vecs_pca)
+
+        start_TR = int(start_time / TR_mri)
+        end_TR = int(end_time / TR_mri)
+
+        start_TR_idx = np.where(np.array(TR_array) >= start_TR)[0][0]
+        end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1]
+
+        fig_width = int(2.5 * (end_time - start_time) / TR_mri)
+        plt.figure(figsize=(fig_width, 5))
+        time = TR_array[start_TR_idx:end_TR_idx] * TR_mri
+        plt.plot(
+            time[start_TR:end_TR], cluster_labels[start_TR_idx:end_TR_idx], linewidth=4
+        )
+        # put vertical lines at the start of each TR
+        for t in time:
+            plt.axvline(x=t, color="r", linestyle="--")
+            # plt.text(t, 0.5, f"TR {int(t/TR_mri)}", fontsize=8, color='black', ha='center')
+        plt.title(f"Cluster labels of {dFC.measure.measure_name}")
+        plt.xlabel("Time (s)")
+
+        # save the figure
+        output_dir = f"{output_root}/subject_results/{subj}/dFC_clustering"
+        if session is not None:
+            output_dir = f"{output_dir}/{session}"
+        output_dir = f"{output_dir}/{task}"
+        if run is not None:
+            output_dir = f"{output_dir}/{run}"
+        output_dir = f"{output_dir}/"
+
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
+        plt.savefig(
+            f"{output_dir}/dFC_clustering_{dFC.measure.measure_name}.{save_fig_format}",
+            dpi=fig_dpi,
+            bbox_inches=fig_bbox_inches,
+            pad_inches=fig_pad,
+            format=save_fig_format,
+        )
+
+        plt.close()
 
 
 def plot_task_presence_features(
@@ -754,6 +850,29 @@ def create_html_report_subj_results(
                                 f"<img src='{dFC_matrices_img}' alt='{file_name}' width='{width}' height='{img_height}'>\n"
                             )
                             file.write("<br>\n")
+
+                # display dFC clustering
+                img_height = 100
+                # for dFC clustering find all png files in the directory
+                dFC_clustering_dir = f"{subj_dir}/dFC_clustering/{session_task_run_dir}"
+                if os.path.exists(dFC_clustering_dir):
+                    for file_name in os.listdir(dFC_clustering_dir):
+                        if file_name.endswith(".png"):
+                            file.write(
+                                f"<h3>{file_name[file_name.find('dFC_clustering_')+15:file_name.find('.png')]}</h3>\n"
+                            )
+                            dFC_clustering_img = f"{dFC_clustering_dir}/{file_name}"
+                            # get the original size of the image
+                            img = plt.imread(dFC_clustering_img)
+                            height, width, _ = img.shape
+                            # change the width so that height equals img_height
+                            width = int(width * img_height / height)
+                            # replace the path to the image with a relative path
+                            dFC_clustering_img = dFC_clustering_img.replace(subj_dir, ".")
+                            file.write(
+                                f"<img src='{dFC_clustering_img}' alt='{file_name}' width='{width}' height='{img_height}'>\n"
+                            )
+                            file.write("<br>\n")
     file.write("</body>\n")
     file.write("</html>\n")
     file.close()
@@ -816,14 +935,15 @@ def create_html_report_group_results(
         for task in TASKS:
             file.write(f"<h1> {task} </h1>\n")
             for run in RUNS[task]:
-                # if run is not None:
-                #     file.write(f"<h2> {run} </h2>\n")
+                if run is not None:
+                    file.write(f"<h2> {run} </h2>\n")
                 if session is not None:
                     classification_dir = f"{group_dir}/classification/{session}"
                 else:
                     classification_dir = f"{group_dir}/classification"
 
                 # display KNN classification results
+                file.write("<h3>KNN</h3>\n")
                 if run is None:
                     classification_img = (
                         f"{classification_dir}/ML_results_classify_KNN_{task}.png"
@@ -842,22 +962,60 @@ def create_html_report_group_results(
                     f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
                 )
 
-                # # display Logistic regression classification results
-                # if run is None:
-                #     classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}.png"
-                # else:
-                #     classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{run}.png"
-                # img = plt.imread(classification_img)
-                # height, width, _ = img.shape
-                # # change the width so that height equals img_height
-                # width = int(width * img_height / height)
-                # # replace the path to the image with a relative path
-                # classification_img = classification_img.replace(group_dir, ".")
-                # file.write(
-                #     f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
-                # )
-
-            file.write("<br>\n")
+                # display Logistic regression classification results
+                file.write("<h3>Logistic Regression</h3>\n")
+                if run is None:
+                    classification_img = (
+                        f"{classification_dir}/ML_results_classify_LogReg_{task}.png"
+                    )
+                else:
+                    classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{run}.png"
+                img = plt.imread(classification_img)
+                height, width, _ = img.shape
+                # change the width so that height equals img_height
+                width = int(width * img_height / height)
+                # replace the path to the image with a relative path
+                classification_img = classification_img.replace(group_dir, ".")
+                file.write(
+                    f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
+                )
+
+                file.write("<br>\n")
+
+    # clustering results
+    img_height = 300
+    file.write("<h1>Clustering Results</h1>\n")
+    for session in SESSIONS:
+        if session is not None:
+            file.write(f"<h1> {session} </h1>\n")
+        for task in TASKS:
+            file.write(f"<h1> {task} </h1>\n")
+            for run in RUNS[task]:
+                if run is not None:
+                    file.write(f"<h2> {run} </h2>\n")
+                if session is not None:
+                    clustering_dir = f"{group_dir}/clustering/{session}"
+                else:
+                    clustering_dir = f"{group_dir}/clustering"
+
+                # display clustering results
+                if run is None:
+                    clustering_img = f"{clustering_dir}/clustering_results_{task}.png"
+                else:
+                    clustering_img = (
+                        f"{clustering_dir}/clustering_results_{task}_{run}.png"
+                    )
+                img = plt.imread(clustering_img)
+                height, width, _ = img.shape
+                # change the width so that height equals img_height
+                width = int(width * img_height / height)
+                # replace the path to the image with a relative path
+                clustering_img = clustering_img.replace(group_dir, ".")
+                file.write(
+                    f"<img src='{clustering_img}' alt='Clustering results' width='{width}' height='{img_height}'>\n"
+                )
+
+                file.write("<br>\n")
     file.write("</body>\n")
     file.write("</html>\n")
     file.close()
@@ -1000,6 +1158,21 @@ def create_html_report_group_results(
                         )
                     except Exception as e:
                         print(f"Error in plotting task presence: {e}")
+
+                    try:
+                        plot_dFC_clustering(
+                            dFC_root=dFC_root,
+                            subj=subj,
+                            task=task,
+                            start_time=start_time,
+                            end_time=end_time,
+                            output_root=reports_root,
+                            run=run,
+                            session=session,
+                            normalize_dFC=True,
+                        )
+                    except Exception as e:
+                        print(f"Error in plotting dFC clustering: {e}")
         # create html report
         try:
             create_html_report_subj_results(
@@ -1050,14 +1223,27 @@ def create_html_report_group_results(
                     )
                 except Exception as e:
                     print(f"Error in plotting ML results for KNN: {e}")
-                # plot_ML_results(
-                #     ML_root=ML_root,
-                #     output_root=reports_root,
-                #     task=task,
-                #     run=run,
-                #     session=session,
-                #     ML_algorithm="Logistic regression",
-                # )
+                try:
+                    plot_ML_results(
+                        ML_root=ML_root,
+                        output_root=reports_root,
+                        task=task,
+                        run=run,
+                        session=session,
+                        ML_algorithm="Logistic regression",
+                    )
+                except Exception as e:
+                    print(f"Error in plotting ML results for Logistic regression: {e}")
+                try:
+                    plot_clustering_results(
+                        ML_root=ML_root,
+                        output_root=reports_root,
+                        task=task,
+                        run=run,
+                        session=session,
+                    )
+                except Exception as e:
+                    print(f"Error in plotting clustering results: {e}")
 
     # create html report
     try:

From 9e3d93ea1c74a47719a75154b0113ddde489a589 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 2 Jul 2024 13:47:15 -0400
Subject: [PATCH 069/401] minor change

---
 task_dFC/generate_report.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 7fd231c..1b0bbb5 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -159,6 +159,7 @@ def plot_roi_signals(
     end_TR = int(end_time / TR_mri)
     # keep the figure width proportional to the number of time points
     fig_width = int(2.5 * (end_time - start_time) / TR_mri)
+    fig_width = min(fig_width, 500)
     plt.figure(figsize=(fig_width, 5))
     for i in nodes_list:
         plt.plot(time[start_TR:end_TR], BOLD.data[i, start_TR:end_TR], linewidth=4)
@@ -215,6 +216,7 @@ def plot_event_labels(
     end_timepoint = int(end_time / TR_task)
     # keep the figure width proportional to the number of time points
     fig_width = int(2.5 * (end_time - start_time) / TR_mri)
+    fig_width = min(fig_width, 500)
     plt.figure(figsize=(fig_width, 5))
     plt.plot(
         time[start_timepoint:end_timepoint],
@@ -283,6 +285,7 @@ def plot_task_presence(
     end_TR = int(end_time / TR_mri)
     # keep the figure width proportional to the number of time points in data
     fig_width = int(2.5 * (end_time - start_time) / TR_mri)
+    fig_width = min(fig_width, 500)
     plt.figure(figsize=(fig_width, 5))
     plt.plot(
         time[start_TR:end_TR], task_presence_non_binarized[start_TR:end_TR], linewidth=4
@@ -643,6 +646,7 @@ def plot_dFC_clustering(
         end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1]
 
         fig_width = int(2.5 * (end_time - start_time) / TR_mri)
+        fig_width = min(fig_width, 500)
         plt.figure(figsize=(fig_width, 5))
         time = TR_array[start_TR_idx:end_TR_idx] * TR_mri
         plt.plot(

From ef88cd6a6b7c986a82347642aea894fa5fa3e7c1 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 8 Jul 2024 13:49:30 -0400
Subject: [PATCH 070/401] add random forest to ML

---
 task_dFC/ML.py | 226 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 158 insertions(+), 68 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index b81f3a2..f288d9f 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -5,9 +5,10 @@
 import numpy as np
 from sklearn.cluster import KMeans
 from sklearn.decomposition import PCA
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score
-from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
@@ -354,6 +355,133 @@ def dFC_feature_extraction(
     )
 
 
+def logistic_regression_classify(X_train, y_train, X_test, y_test):
+    """
+    Logistic regression classification
+    """
+    # create a pipeline with a logistic regression model to find the best C
+    logistic_reg = make_pipeline(StandardScaler(), LogisticRegression())
+    # create a dictionary of all values we want to test for C
+    param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
+    # use gridsearch to test all values for C
+    lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=5)
+    # fit model to data
+    lr_gscv.fit(X_train, y_train)
+
+    C = lr_gscv.best_params_["logisticregression__C"]
+
+    log_reg = make_pipeline(
+        StandardScaler(),
+        LogisticRegression(C=C),
+    ).fit(X_train, y_train)
+
+    RESULT = {
+        "log_reg_model": log_reg,
+        "log_reg_C": C,
+        "log_reg_train_score": log_reg.score(X_train, y_train),
+        "log_reg_test_score": log_reg.score(X_test, y_test),
+    }
+
+    return RESULT
+
+
+def KNN_classify(X_train, y_train, X_test, y_test, explained_var_threshold=0.95):
+    """
+    KNN classification
+    """
+    # find num_PCs
+    pca = PCA(svd_solver="full", whiten=False)
+    pca.fit(X_train)
+    num_PCs = (
+        np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
+        + 1
+    )
+
+    # create a pipeline with a knn model to find the best n_neighbors
+    knn = make_pipeline(
+        StandardScaler(),
+        PCA(n_components=num_PCs),
+        KNeighborsClassifier(),
+    )
+    # create a dictionary of all values we want to test for n_neighbors
+    param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)}
+    # use gridsearch to test all values for n_neighbors
+    knn_gscv = GridSearchCV(knn, param_grid, cv=5)
+    # fit model to data
+    knn_gscv.fit(X_train, y_train)
+
+    n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"]
+
+    neigh = make_pipeline(
+        StandardScaler(),
+        PCA(n_components=num_PCs),
+        KNeighborsClassifier(n_neighbors=n_neighbors),
+    ).fit(X_train, y_train)
+
+    RESULT = {
+        "KNN_pca": pca,
+        "KNN_num_PCs": num_PCs,
+        "KNN_cv_results": knn_gscv.cv_results_,
+        "KNN_model": neigh,
+        "KNN_train_score": neigh.score(X_train, y_train),
+        "KNN_test_score": neigh.score(X_test, y_test),
+    }
+
+    return RESULT
+
+
+def random_forest_classify(
+    X_train, y_train, X_test, y_test, explained_var_threshold=0.95
+):
+    """
+    Random Forest classification
+    """
+    # find num_PCs
+    pca = PCA(svd_solver="full", whiten=False)
+    pca.fit(X_train)
+    num_PCs = (
+        np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
+        + 1
+    )
+    num_PCs = min(num_PCs, 100)
+
+    # create a pipeline with a random forest model to find the best n_estimators and max_depth
+    rf = make_pipeline(
+        StandardScaler(),
+        PCA(n_components=num_PCs),
+        RandomForestClassifier(),
+    )
+    # create a dictionary of all values we want to test for n_estimators and max_depth
+    param_grid = {
+        "randomforestclassifier__n_estimators": [10, 50, 100, 200],
+        "randomforestclassifier__max_depth": [None, 5, 10, 15, 20, 30],
+    }
+    # use gridsearch to test all combinations of n_estimators and max_depth
+    rf_gscv = GridSearchCV(rf, param_grid, cv=5)
+    # fit model to data
+    rf_gscv.fit(X_train, y_train)
+
+    n_estimators = rf_gscv.best_params_["randomforestclassifier__n_estimators"]
+    max_depth = rf_gscv.best_params_["randomforestclassifier__max_depth"]
+
+    rf = make_pipeline(
+        StandardScaler(),
+        PCA(n_components=num_PCs),
+        RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth),
+    ).fit(X_train, y_train)
+
+    RESULT = {
+        "RF_pca": pca,
+        "RF_num_PCs": num_PCs,
+        "RF_cv_results": rf_gscv.cv_results_,
+        "RF_model": rf,
+        "RF_train_score": rf.score(X_train, y_train),
+        "RF_test_score": rf.score(X_test, y_test),
+    }
+
+    return RESULT
+
+
 def task_presence_classification(
     task,
     dFC_id,
@@ -367,7 +495,7 @@ def task_presence_classification(
     explained_var_threshold=0.95,
 ):
     """
-    perform task presence classification using logistic regression and KNN
+    perform task presence classification using logistic regression, KNN, or Random Forest
     for a given task and dFC method and run.
     """
     if run is None:
@@ -412,72 +540,25 @@ def task_presence_classification(
     print("task presence classification ...")
 
     # logistic regression
-    logistic_reg = make_pipeline(StandardScaler(), LogisticRegression())
-    # create a dictionary of all values we want to test for C
-    param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
-    # use gridsearch to test all values for C
-    lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=5)
-    # fit model to data
-    lr_gscv.fit(X_train, y_train)
-
-    C = lr_gscv.best_params_["logisticregression__C"]
+    log_reg_RESULT = logistic_regression_classify(X_train, y_train, X_test, y_test)
 
-    log_reg = make_pipeline(
-        StandardScaler(),
-        LogisticRegression(C=C),
-    ).fit(X_train, y_train)
+    # # KNN
+    # KNN_RESULT = KNN_classify(
+    #     X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold
+    # )
 
-    # KNN
-    # find num_PCs
-    pca = PCA(svd_solver="full", whiten=False)
-    pca.fit(X_train)
-    num_PCs = (
-        np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
-        + 1
-    )
-
-    # create a pipeline with a knn model to find the best n_neighbors
-    knn = make_pipeline(
-        StandardScaler(),
-        PCA(n_components=num_PCs),
-        KNeighborsClassifier(),
+    # Random Forest
+    RF_RESULT = random_forest_classify(
+        X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold
     )
-    # create a dictionary of all values we want to test for n_neighbors
-    param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)}
-    # use gridsearch to test all values for n_neighbors
-    knn_gscv = GridSearchCV(knn, param_grid, cv=5)
-    # fit model to data
-    knn_gscv.fit(X_train, y_train)
 
-    n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"]
-
-    neigh = make_pipeline(
-        StandardScaler(),
-        PCA(n_components=num_PCs),
-        KNeighborsClassifier(n_neighbors=n_neighbors),
-    ).fit(X_train, y_train)
-
-    ML_RESULT = {
-        "logistic regression": log_reg,
-        "logistic regression C": C,
-        "logistic regression train score": log_reg.score(X_train, y_train),
-        "logistic regression test score": log_reg.score(X_test, y_test),
-        "pca": pca,
-        "num_PCs": num_PCs,
-        "cv_results": knn_gscv.cv_results_,
-        "KNN": neigh,
-        "KNN train score": neigh.score(X_train, y_train),
-        "KNN test score": neigh.score(X_test, y_test),
-    }
-
-    print(
-        f"Logistic regression train score {measure_name} {task}: {log_reg.score(X_train, y_train)}"
-    )
-    print(
-        f"Logistic regression test score {measure_name} {task}: {log_reg.score(X_test, y_test)}"
-    )
-    print(f"KNN train score {measure_name} {task}: {neigh.score(X_train, y_train)}")
-    print(f"KNN test score {measure_name} {task}: {neigh.score(X_test, y_test)}")
+    ML_RESULT = {}
+    for key in log_reg_RESULT:
+        ML_RESULT[key] = log_reg_RESULT[key]
+    # for key in KNN_RESULT:
+    #     ML_RESULT[key] = KNN_RESULT[key]
+    for key in RF_RESULT:
+        ML_RESULT[key] = RF_RESULT[key]
 
     # measure pred score on each subj
 
@@ -488,8 +569,12 @@ def task_presence_classification(
         "run": list(),
         "dFC method": list(),
         "Logistic regression accuracy": list(),
-        "KNN accuracy": list(),
+        # "KNN accuracy": list(),
     }
+    log_reg = log_reg_RESULT["log_reg_model"]
+    # KNN = KNN_RESULT["KNN_model"]
+    RF = RF_RESULT["RF_model"]
+
     for subj in SUBJECTS:
         ML_scores["subj_id"].append(subj)
         if subj in train_subjects:
@@ -502,12 +587,16 @@ def task_presence_classification(
             target = y_test[subj_label_test == subj]
 
         pred_lr = log_reg.predict(features)
-        pred_KNN = neigh.predict(features)
+        # pred_KNN = KNN.predict(features)
+        pred_RF = RF.predict(features)
 
         ML_scores["Logistic regression accuracy"].append(
             balanced_accuracy_score(target, pred_lr)
         )
-        ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN))
+        # ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN))
+        ML_scores["Random Forest accuracy"].append(
+            balanced_accuracy_score(target, pred_RF)
+        )
 
         ML_scores["task"].append(task)
         ML_scores["run"].append(run)
@@ -637,7 +726,8 @@ def run_classification(
             "run": list(),
             "dFC method": list(),
             "Logistic regression accuracy": list(),
-            "KNN accuracy": list(),
+            # "KNN accuracy": list(),
+            "Random Forest accuracy": list(),
         }
         for dFC_id in range(0, 7):
             print(f"=================== dFC {dFC_id} ===================")

From 571e8cd564944d41f6d820ecb6a35c9a1e202968 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 8 Jul 2024 18:36:07 -0400
Subject: [PATCH 071/401] parallel dFC_ids in ML

---
 task_dFC/ML.py | 183 +++++++++++++++++++++++++------------------------
 1 file changed, 95 insertions(+), 88 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index f288d9f..db2489e 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -707,6 +707,7 @@ def task_presence_clustering(
 
 
 def run_classification(
+    dFC_id,
     TASKS,
     RUNS,
     SESSIONS,
@@ -729,42 +730,41 @@ def run_classification(
             # "KNN accuracy": list(),
             "Random Forest accuracy": list(),
         }
-        for dFC_id in range(0, 7):
-            print(f"=================== dFC {dFC_id} ===================")
-
-            ML_RESULT = {}
-            for task_id, task in enumerate(TASKS):
-                ML_RESULT[task] = {}
-                for run in RUNS[task]:
-                    ML_RESULT_new, ML_scores_new = task_presence_classification(
-                        task=task,
-                        dFC_id=dFC_id,
-                        roi_root=roi_root,
-                        dFC_root=dFC_root,
-                        run=run,
-                        session=session,
-                        dynamic_pred=dynamic_pred,
-                        normalize_dFC=normalize_dFC,
-                    )
-                    if run is None:
-                        ML_RESULT[task] = ML_RESULT_new
-                    else:
-                        ML_RESULT[task][run] = ML_RESULT_new
-                    for key in ML_scores:
-                        ML_scores[key].extend(ML_scores_new[key])
 
-            if session is None:
-                folder = f"{output_root}"
-            else:
-                folder = f"{output_root}/{session}"
-            if not os.path.exists(folder):
-                os.makedirs(folder)
-            np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT)
+        ML_RESULT = {}
+        for task_id, task in enumerate(TASKS):
+            ML_RESULT[task] = {}
+            for run in RUNS[task]:
+                ML_RESULT_new, ML_scores_new = task_presence_classification(
+                    task=task,
+                    dFC_id=dFC_id,
+                    roi_root=roi_root,
+                    dFC_root=dFC_root,
+                    run=run,
+                    session=session,
+                    dynamic_pred=dynamic_pred,
+                    normalize_dFC=normalize_dFC,
+                )
+                if run is None:
+                    ML_RESULT[task] = ML_RESULT_new
+                else:
+                    ML_RESULT[task][run] = ML_RESULT_new
+                for key in ML_scores:
+                    ML_scores[key].extend(ML_scores_new[key])
 
-        np.save(f"{folder}/ML_scores_classify.npy", ML_scores)
+        if session is None:
+            folder = f"{output_root}"
+        else:
+            folder = f"{output_root}/{session}"
+        if not os.path.exists(folder):
+            os.makedirs(folder)
+        np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT)
+
+        np.save(f"{folder}/ML_scores_classify_{dFC_id}.npy", ML_scores)
 
 
 def run_clustering(
+    dFC_id,
     TASKS,
     RUNS,
     SESSIONS,
@@ -783,40 +783,36 @@ def run_clustering(
             "dFC method": list(),
             "Kmeans ARI": list(),
         }
-        for dFC_id in range(0, 7):
-            print(f"=================== dFC {dFC_id} ===================")
-
-            clustering_RESULTS = {}
-            for task_id, task in enumerate(TASKS):
-                clustering_RESULTS[task] = {}
-                for run in RUNS[task]:
-                    clustering_RESULTS_new, clustering_scores_new = (
-                        task_presence_clustering(
-                            task=task,
-                            dFC_id=dFC_id,
-                            roi_root=roi_root,
-                            dFC_root=dFC_root,
-                            run=run,
-                            session=session,
-                            normalize_dFC=normalize_dFC,
-                        )
-                    )
-                    if run is None:
-                        clustering_RESULTS[task] = clustering_RESULTS_new
-                    else:
-                        clustering_RESULTS[task][run] = clustering_RESULTS_new
-                    for key in clustering_scores:
-                        clustering_scores[key].extend(clustering_scores_new[key])
 
-            if session is None:
-                folder = f"{output_root}"
-            else:
-                folder = f"{output_root}/{session}"
-            if not os.path.exists(folder):
-                os.makedirs(folder)
-            np.save(f"{folder}/clustering_RESULTS_{dFC_id}.npy", clustering_RESULTS)
+        clustering_RESULTS = {}
+        for task_id, task in enumerate(TASKS):
+            clustering_RESULTS[task] = {}
+            for run in RUNS[task]:
+                clustering_RESULTS_new, clustering_scores_new = task_presence_clustering(
+                    task=task,
+                    dFC_id=dFC_id,
+                    roi_root=roi_root,
+                    dFC_root=dFC_root,
+                    run=run,
+                    session=session,
+                    normalize_dFC=normalize_dFC,
+                )
+                if run is None:
+                    clustering_RESULTS[task] = clustering_RESULTS_new
+                else:
+                    clustering_RESULTS[task][run] = clustering_RESULTS_new
+                for key in clustering_scores:
+                    clustering_scores[key].extend(clustering_scores_new[key])
+
+        if session is None:
+            folder = f"{output_root}"
+        else:
+            folder = f"{output_root}/{session}"
+        if not os.path.exists(folder):
+            os.makedirs(folder)
+        np.save(f"{folder}/clustering_RESULTS_{dFC_id}.npy", clustering_RESULTS)
 
-        np.save(f"{folder}/clustering_scores.npy", clustering_scores)
+        np.save(f"{folder}/clustering_scores_{dFC_id}.npy", clustering_scores)
 
 
 #######################################################################################
@@ -885,32 +881,43 @@ def run_clustering(
         roi_root=roi_root,
         output_root=ML_root,
     )
-
     print("Task features extraction finished.")
-    print("Task presence classification started ...")
-    run_classification(
-        TASKS=TASKS,
-        RUNS=RUNS,
-        SESSIONS=SESSIONS,
-        roi_root=roi_root,
-        dFC_root=dFC_root,
-        output_root=ML_root,
-        dynamic_pred="no",
-        normalize_dFC=True,
-    )
-    print("Task presence classification finished.")
-    print("Task presence clustering started ...")
-    run_clustering(
-        TASKS=TASKS,
-        RUNS=RUNS,
-        SESSIONS=SESSIONS,
-        roi_root=roi_root,
-        dFC_root=dFC_root,
-        output_root=ML_root,
-        normalize_dFC=True,
-    )
-    print("Task presence clustering finished.")
 
-    print("Task presence prediction CODE finished running.")
+    job_id = int(os.getenv("SGE_TASK_ID"))
+    dFC_id = job_id - 1  # SGE_TASK_ID starts from 1 not 0
+
+    print(f"Task presence classification started for dFC ID {dFC_id}...")
+    try:
+        run_classification(
+            dFC_id=dFC_id,
+            TASKS=TASKS,
+            RUNS=RUNS,
+            SESSIONS=SESSIONS,
+            roi_root=roi_root,
+            dFC_root=dFC_root,
+            output_root=ML_root,
+            dynamic_pred="no",
+            normalize_dFC=True,
+        )
+    except Exception as e:
+        print(f"Error in classification for dFC ID {dFC_id}: {e}")
+    print(f"Task presence classification finished for dFC ID {dFC_id}.")
+    print(f"Task presence clustering started for dFC ID {dFC_id} ...")
+    try:
+        run_clustering(
+            dFC_id=dFC_id,
+            TASKS=TASKS,
+            RUNS=RUNS,
+            SESSIONS=SESSIONS,
+            roi_root=roi_root,
+            dFC_root=dFC_root,
+            output_root=ML_root,
+            normalize_dFC=True,
+        )
+    except Exception as e:
+        print(f"Error in clustering for dFC ID {dFC_id}: {e}")
+
+    print(f"Task presence clustering finished for dFC ID {dFC_id}.")
+    print(f"Task presence prediction finished for dFC ID {dFC_id}.")
 
 #######################################################################################

From abf27ed53e508c878cfe74314901a7ea6a50c7db Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 8 Jul 2024 19:28:59 -0400
Subject: [PATCH 072/401] minor change

---
 task_dFC/ML.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index db2489e..080efb1 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -454,7 +454,7 @@ def random_forest_classify(
     # create a dictionary of all values we want to test for n_estimators
     param_grid = {
         "randomforestclassifier__n_estimators": [10, 50, 100, 200],
-        "randomforestclassifier__max_depth": [None, 5, 10, 15, 20, 30],
+        "randomforestclassifier__max_depth": [None, 5, 10, 20, 30],
     }
     # use gridsearch to test all values for n_estimators
     rf_gscv = GridSearchCV(rf, param_grid, cv=5)

From e530e07bcaa7cfefc5ab30260a92c586609e5475 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 8 Jul 2024 20:04:00 -0400
Subject: [PATCH 073/401] add new tasks to simul

---
 pydfc/simul_utils.py             | 54 +++++++++++++++++++++++++++++++-
 simul_dFC/task_data_simulator.py | 33 +++++++++++++++++++
 2 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py
index b4e155f..fa4631e 100644
--- a/pydfc/simul_utils.py
+++ b/pydfc/simul_utils.py
@@ -60,6 +60,7 @@ def simulate_task_BOLD(
     sim_length,
     BOLD_period,
     TAVG_period,
+    num_stimulated_regions=5,
     global_conn_coupling_coef=0.0126,
     D=0.001,
     conn_speed=1.0,
@@ -68,6 +69,27 @@ def simulate_task_BOLD(
 ):
     """
     Simulate BOLD signal for a task.
+
+    Parameters
+    ----------
+    onset_time : float
+        The onset time of the task.
+    task_duration : float
+        The duration of the task.
+    task_block_duration : float
+        The duration of the task block.
+    sim_length : float
+        The length of the simulation.
+    BOLD_period : float
+        The BOLD period.
+    TAVG_period : float
+        The TAVG period.
+    num_stimulated_regions : int, optional
+        The number of stimulated regions. The default is 5.
+        if num_stimulated_regions is 5, the stimulated regions are:
+        [0, 7, 13, 33, 42]
+        if num_stimulated_regions is 15, the stimulated regions are:
+        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70]
     """
     # randomize some parameters for each subjects
     onset = np.random.normal(loc=onset_time, scale=0.5)  # seconds
@@ -78,8 +100,33 @@ def simulate_task_BOLD(
     conn.speed = np.array([conn_speed_rand])
 
     # configure stimulus spatial pattern
+    if num_stimulated_regions == 5:
+        stimulated_regions_list = [0, 7, 13, 33, 42]
+    elif num_stimulated_regions == 15:
+        stimulated_regions_list = [
+            0,
+            5,
+            10,
+            15,
+            20,
+            25,
+            30,
+            35,
+            40,
+            45,
+            50,
+            55,
+            60,
+            65,
+            70,
+        ]
+    else:
+        stimulated_regions_list = np.random.choice(
+            np.arange(76), num_stimulated_regions, replace=False
+        )
+        stimulated_regions_list = list(stimulated_regions_list)
     weighting = create_random_stimulus_weights(
-        stimulated_regions_list=[0, 7, 13, 33, 42], n_regions=76
+        stimulated_regions_list=stimulated_regions_list, n_regions=76
     )
 
     stimulus = create_stimulus(
@@ -227,6 +274,7 @@ def simulate_task_BOLD_TS(
     sim_length,
     BOLD_period,
     TAVG_period,
+    num_stimulated_regions=5,
     global_conn_coupling_coef=0.0126,
     D=0.001,
     conn_speed=1.0,
@@ -244,6 +292,7 @@ def simulate_task_BOLD_TS(
             sim_length=sim_length,
             BOLD_period=BOLD_period,
             TAVG_period=TAVG_period,
+            num_stimulated_regions=num_stimulated_regions,
             global_conn_coupling_coef=global_conn_coupling_coef,
             D=D,
             conn_speed=conn_speed,
@@ -298,6 +347,8 @@ def simulate_task_data(subj_id, task_info):
                 The BOLD period.
             - TAVG_period: float
                 The TAVG period.
+            - num_stimulated_regions: int
+                The number of stimulated regions.
             - global_conn_coupling_coef: float
                 The global connectivity coupling coefficient.
             - D: float
@@ -316,6 +367,7 @@ def simulate_task_data(subj_id, task_info):
         sim_length=task_info["sim_length"],
         BOLD_period=task_info["BOLD_period"],
         TAVG_period=task_info["TAVG_period"],
+        num_stimulated_regions=task_info["num_stimulated_regions"],
         global_conn_coupling_coef=task_info["global_conn_coupling_coef"],
         D=task_info["D"],
         conn_speed=task_info["conn_speed"],
diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index 24aa92a..912bfe1 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -53,6 +53,7 @@
         "sim_length": sim_length,
         "BOLD_period": BOLD_period,
         "TAVG_period": TAVG_period,
+        "num_stimulated_regions": 5,
         "global_conn_coupling_coef": global_conn_coupling_coef,
         "D": D,
         "conn_speed": conn_speed,
@@ -66,6 +67,21 @@
         "sim_length": sim_length,
         "BOLD_period": BOLD_period,
         "TAVG_period": TAVG_period,
+        "num_stimulated_regions": 5,
+        "global_conn_coupling_coef": global_conn_coupling_coef,
+        "D": D,
+        "conn_speed": conn_speed,
+        "dt": dt,
+    },
+    "task-lowFreqShortTask": {
+        "task_name": "task-lowFreqShortTask",
+        "onset_time": onset_time,
+        "task_duration": 1.0,
+        "task_block_duration": 20.0,
+        "sim_length": sim_length,
+        "BOLD_period": BOLD_period,
+        "TAVG_period": TAVG_period,
+        "num_stimulated_regions": 5,
         "global_conn_coupling_coef": global_conn_coupling_coef,
         "D": D,
         "conn_speed": conn_speed,
@@ -79,6 +95,7 @@
         "sim_length": sim_length,
         "BOLD_period": BOLD_period,
         "TAVG_period": TAVG_period,
+        "num_stimulated_regions": 5,
         "global_conn_coupling_coef": global_conn_coupling_coef,
         "D": D,
         "conn_speed": conn_speed,
@@ -92,6 +109,21 @@
         "sim_length": sim_length,
         "BOLD_period": BOLD_period,
         "TAVG_period": TAVG_period,
+        "num_stimulated_regions": 5,
+        "global_conn_coupling_coef": global_conn_coupling_coef,
+        "D": D,
+        "conn_speed": conn_speed,
+        "dt": dt,
+    },
+    "task-lowFreqShortRestDominStimul": {
+        "task_name": "task-lowFreqShortRestDominStimul",
+        "onset_time": onset_time,
+        "task_duration": 12.0,
+        "task_block_duration": 20.0,
+        "sim_length": sim_length,
+        "BOLD_period": BOLD_period,
+        "TAVG_period": TAVG_period,
+        "num_stimulated_regions": 15,
         "global_conn_coupling_coef": global_conn_coupling_coef,
         "D": D,
         "conn_speed": conn_speed,
@@ -105,6 +137,7 @@
         "sim_length": sim_length,
         "BOLD_period": BOLD_period,
         "TAVG_period": TAVG_period,
+        "num_stimulated_regions": 5,
         "global_conn_coupling_coef": global_conn_coupling_coef,
         "D": D * 100,
         "conn_speed": conn_speed,

From fa6f28390a1d9ed5893fa7444e310713cd2aa94b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 8 Jul 2024 20:04:40 -0400
Subject: [PATCH 074/401] minor fix in report

---
 task_dFC/generate_report.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 1b0bbb5..3db79d1 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -414,7 +414,17 @@ def plot_dFC_matrices(
     end_TR = int(end_time / TR_mri)
     start_TR_idx = np.where(np.array(TRs) >= start_TR)[0][0]
     end_TR_idx = np.where(np.array(TRs) <= end_TR)[0][-1]
-    chosen_TRs = TRs[start_TR_idx:end_TR_idx]
+    # if the TR_mri is low which will cause the figure to be too wide,
+    # we will only plot a resampled version of the dFC matrices, e.g. to make it the same as TR_mri=2s
+    if TR_mri < 2:
+        TR_step = int(2 / TR_mri)
+        chosen_TRs = TRs[start_TR_idx:end_TR_idx:TR_step]
+        # raise warning if the TR_mri is low
+        print(
+            f"TR_mri is low ({TR_mri}s), the dFC matrices will be resampled to make the figure width reasonable"
+        )
+    else:
+        chosen_TRs = TRs[start_TR_idx:end_TR_idx]
 
     output_dir = f"{output_root}/subject_results/{subj}/dFC_matrices"
     if session is not None:

From 2278e9b843fb3cded9ab50df315f713dd9a4feaa Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 9 Jul 2024 13:24:55 -0400
Subject: [PATCH 075/401] add paradigm clustering

---
 task_dFC/ML.py | 139 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 132 insertions(+), 7 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 080efb1..e694133 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -396,11 +396,11 @@ def KNN_classify(X_train, y_train, X_test, y_test, explained_var_threshold=0.95)
         np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
         + 1
     )
-
+    num_PCs = min(num_PCs, 100)
     # create a pipeline with a knn model to find the best n_neighbors
     knn = make_pipeline(
         StandardScaler(),
-        PCA(n_components=num_PCs),
+        PCA(n_components=num_PCs, svd_solver="full", whiten=False),
         KNeighborsClassifier(),
     )
     # create a dictionary of all values we want to test for n_neighbors
@@ -414,7 +414,7 @@ def KNN_classify(X_train, y_train, X_test, y_test, explained_var_threshold=0.95)
 
     neigh = make_pipeline(
         StandardScaler(),
-        PCA(n_components=num_PCs),
+        PCA(n_components=num_PCs, svd_solver="full", whiten=False),
         KNeighborsClassifier(n_neighbors=n_neighbors),
     ).fit(X_train, y_train)
 
@@ -448,7 +448,7 @@ def random_forest_classify(
     # create a pipeline with a random forest model to find the best n_estimators
     rf = make_pipeline(
         StandardScaler(),
-        PCA(n_components=num_PCs),
+        PCA(n_components=num_PCs, svd_solver="full", whiten=False),
         RandomForestClassifier(),
     )
     # create a dictionary of all values we want to test for n_estimators
@@ -466,7 +466,7 @@ def random_forest_classify(
 
     rf = make_pipeline(
         StandardScaler(),
-        PCA(n_components=num_PCs),
+        PCA(n_components=num_PCs, svd_solver="full", whiten=False),
         RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth),
     ).fit(X_train, y_train)
 
@@ -651,13 +651,14 @@ def task_presence_clustering(
     X_normalized = scaler.fit_transform(X)
     # PCA
     # find number of components that explain 95% of variance
-    pca = PCA()
+    pca = PCA(svd_solver="full", whiten=False)
     pca.fit(X_normalized)
     n_components = (
         np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
         + 1
     )
-    pca = PCA(n_components=n_components)
+    n_components = min(n_components, 100)
+    pca = PCA(n_components=n_components, svd_solver="full", whiten=False)
     X_pca = pca.fit_transform(X_normalized)
     kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
     labels_pred = kmeans.fit_predict(X_pca)
@@ -815,6 +816,113 @@ def run_clustering(
         np.save(f"{folder}/clustering_scores_{dFC_id}.npy", clustering_scores)
 
 
+def task_paradigm_clustering(
+    dFC_id,
+    TASKS,
+    RUNS,
+    SESSIONS,
+    roi_root,
+    dFC_root,
+    output_root,
+    normalize_dFC=True,
+    explained_var_threshold=0.95,
+):
+    for session in SESSIONS:
+        # find SUBJECTS common to all tasks
+        for task_id, task in enumerate(TASKS):
+            if task_id == 0:
+                SUBJECTS = find_available_subjects(
+                    dFC_root=dFC_root, task=task, dFC_id=dFC_id
+                )
+            else:
+                SUBJECTS = np.intersect1d(
+                    SUBJECTS,
+                    find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id),
+                )
+        print(f"Number of subjects: {len(SUBJECTS)}")
+
+        X = None
+        y = None
+        for task_id, task in enumerate(TASKS):
+            for run in RUNS[task]:
+                X_new, _, _, _, _, _, measure_name = dFC_feature_extraction(
+                    task=task,
+                    train_subjects=SUBJECTS,
+                    test_subjects=[],
+                    dFC_id=dFC_id,
+                    roi_root=roi_root,
+                    dFC_root=dFC_root,
+                    run=run,
+                    session=session,
+                    dynamic_pred="no",
+                    normalize_dFC=normalize_dFC,
+                )
+                y_new = np.ones(X_new.shape[0]) * task_id
+                if X is None and y is None:
+                    X = X_new
+                    y = y_new
+                else:
+                    X = np.concatenate((X, X_new), axis=0)
+                    y = np.concatenate((y, y_new), axis=0)
+
+        assert X.shape[0] == y.shape[0], "Number of samples do not match."
+
+        # clustering
+        # apply kmeans clustering with PCA to dFC features
+
+        n_clusters = len(TASKS)  # corresponding to task paradigms
+
+        scaler = StandardScaler()
+        X_normalized = scaler.fit_transform(X)
+        # PCA
+        # find number of components that explain 95% of variance
+        pca = PCA(svd_solver="full", whiten=False)
+        pca.fit(X_normalized)
+        n_components = (
+            np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[
+                0
+            ][0]
+            + 1
+        )
+        n_components = min(n_components, 100)
+        pca = PCA(n_components=n_components, svd_solver="full", whiten=False)
+        X_pca = pca.fit_transform(X_normalized)
+        kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
+        labels_pred = kmeans.fit_predict(X_pca)
+
+        # ARI score
+        print(f"ARI score: {adjusted_rand_score(y, labels_pred)}")
+
+        # visualize clustering centroids
+        centroids = kmeans.cluster_centers_
+        centroids = pca.inverse_transform(centroids)
+        centroids = scaler.inverse_transform(centroids)
+        n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
+        centroids_mat = dFC_vec2mat(centroids, n_regions)
+
+        task_paradigm_clstr_RESULTS = {
+            "StandardScaler": scaler,
+            "num_PCs": n_components,
+            "PCA": pca,
+            "kmeans": kmeans,
+            "ARI": adjusted_rand_score(y, labels_pred),
+            "centroids": centroids_mat,
+            "task_paradigms": TASKS,
+        }
+
+        if session is None:
+            folder = f"{output_root}"
+        else:
+            folder = f"{output_root}/{session}"
+        if not os.path.exists(folder):
+            os.makedirs(folder)
+
+        np.save(
+            f"{folder}/task_paradigm_clstr_RESULTS_{dFC_id}.npy",
+            task_paradigm_clstr_RESULTS,
+        )
+
+
 #######################################################################################
 
 if __name__ == "__main__":
@@ -918,6 +1026,23 @@ def run_clustering(
         print(f"Error in clustering for dFC ID {dFC_id}: {e}")
 
     print(f"Task presence clustering finished for dFC ID {dFC_id}.")
+
+    print(f"Task paradigm clustering started for dFC ID {dFC_id} ...")
+    try:
+        task_paradigm_clustering(
+            dFC_id=dFC_id,
+            TASKS=TASKS,
+            RUNS=RUNS,
+            SESSIONS=SESSIONS,
+            roi_root=roi_root,
+            dFC_root=dFC_root,
+            output_root=ML_root,
+            normalize_dFC=True,
+        )
+    except Exception as e:
+        print(f"Error in task paradigm clustering for dFC ID {dFC_id}: {e}")
+
+    print(f"Task paradigm clustering finished for dFC ID {dFC_id}.")
     print(f"Task presence prediction finished for dFC ID {dFC_id}.")
 
 #######################################################################################

From a24931851bfe3806bb158ffa64592b388b3afa2a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 10 Jul 2024 11:06:47 -0400
Subject: [PATCH 076/401] minor fix

---
 task_dFC/ML.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index e694133..eb853a1 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -570,6 +570,7 @@ def task_presence_classification(
         "dFC method": list(),
         "Logistic regression accuracy": list(),
         # "KNN accuracy": list(),
+        "Random Forest accuracy": list(),
     }
     log_reg = log_reg_RESULT["log_reg_model"]
     # KNN = KNN_RESULT["KNN_model"]

From c15f03416af6a7c0d0bd45b775985ea1940745a3 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 11 Jul 2024 10:48:00 -0400
Subject: [PATCH 077/401] change stimulated regions in simul

---
 pydfc/simul_utils.py             | 29 +++++++++--------------------
 simul_dFC/task_data_simulator.py |  2 +-
 2 files changed, 10 insertions(+), 21 deletions(-)

diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py
index fa4631e..79a9cd6 100644
--- a/pydfc/simul_utils.py
+++ b/pydfc/simul_utils.py
@@ -88,8 +88,11 @@ def simulate_task_BOLD(
         The number of stimulated regions. The default is 5.
         if num_stimulated_regions is 5, the stimulated regions are:
         [0, 7, 13, 33, 42]
-        if num_stimulated_regions is 15, the stimulated regions are:
-        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70]
+        if num_stimulated_regions is 16, the stimulated regions are:
+        regions = list(range(0, 76, 5))
+        if num_stimulated_regions is 26, the stimulated regions are:
+        regions = list(range(0, 76, 3))
+        else, the stimulated regions are randomly selected.
     """
     # randomize some parameters for each subjects
     onset = np.random.normal(loc=onset_time, scale=0.5)  # seconds
@@ -102,24 +105,10 @@ def simulate_task_BOLD(
     # configure stimulus spatial pattern
     if num_stimulated_regions == 5:
         stimulated_regions_list = [0, 7, 13, 33, 42]
-    elif num_stimulated_regions == 15:
-        stimulated_regions_list = [
-            0,
-            5,
-            10,
-            15,
-            20,
-            25,
-            30,
-            35,
-            40,
-            45,
-            50,
-            55,
-            60,
-            65,
-            70,
-        ]
+    elif num_stimulated_regions == 16:
+        stimulated_regions_list = list(range(0, 76, 5))
+    elif num_stimulated_regions == 26:
+        stimulated_regions_list = list(range(0, 76, 3))
     else:
         stimulated_regions_list = np.random.choice(
             np.arange(76), num_stimulated_regions, replace=False
diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index 912bfe1..ba3b6c5 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -123,7 +123,7 @@
         "sim_length": sim_length,
         "BOLD_period": BOLD_period,
         "TAVG_period": TAVG_period,
-        "num_stimulated_regions": 15,
+        "num_stimulated_regions": 26,
         "global_conn_coupling_coef": global_conn_coupling_coef,
         "D": D,
         "conn_speed": conn_speed,

From 6b53cd4ef80a1af11b2f2f4726b607e0a56b2956 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 11 Jul 2024 11:40:15 -0400
Subject: [PATCH 078/401] add RF and GBT to report

---
 task_dFC/generate_report.py | 68 ++++++++++++++++++++++++++-----------
 1 file changed, 49 insertions(+), 19 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 3db79d1..df60342 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -446,7 +446,7 @@ def plot_dFC_matrices(
 
 
 def plot_ML_results(
-    ML_root, output_root, task, run=None, session=None, ML_algorithm="KNN"
+    ML_root, output_root, task, run=None, session=None, ML_algorithm="Random Forest"
 ):
     """
     Plot the ML results for a given task, run and session.
@@ -457,16 +457,27 @@ def plot_ML_results(
         task: str, task name
         run: int, run number
         session: str, session name
-        ML_algorithm: str, ML algorithm name (default: KNN, other options: Logistic regression)
+        ML_algorithm: str, ML algorithm name (default: Random Forest, other options: Logistic regression, KNN, Gradient Boosting)
     """
+    # the ML_scores files are saved as ML_scores_classify_{dFC_id}.npy
+    # find all the ML_scores files in the directory
     if session is None:
-        ML_scores = np.load(
-            f"{ML_root}/ML_scores_classify.npy", allow_pickle="TRUE"
-        ).item()
+        input_dir = f"{ML_root}"
     else:
-        ML_scores = np.load(
-            f"{ML_root}/{session}/ML_scores_classify.npy", allow_pickle="TRUE"
-        ).item()
+        input_dir = f"{ML_root}/{session}"
+    ALL_ML_SCORES = os.listdir(input_dir)
+    ALL_ML_SCORES = [
+        score_file for score_file in ALL_ML_SCORES if "ML_scores_classify" in score_file
+    ]
+    ALL_ML_SCORES.sort()
+    ML_scores = None
+    for score_file in ALL_ML_SCORES:
+        ML_scores_new = np.load(f"{input_dir}/{score_file}", allow_pickle="TRUE").item()
+        if ML_scores is None:
+            ML_scores = ML_scores_new
+        else:
+            for key in ML_scores_new.keys():
+                ML_scores[key].extend(ML_scores_new[key])
 
     sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0})
 
@@ -506,6 +517,10 @@ def plot_ML_results(
         ML_algorithm_name = "LogReg"
     elif ML_algorithm == "KNN":
         ML_algorithm_name = "KNN"
+    elif ML_algorithm == "Random Forest":
+        ML_algorithm_name = "RF"
+    elif ML_algorithm == "Gradient Boosting":
+        ML_algorithm_name = "GBT"
 
     if run is None:
         plt.savefig(
@@ -538,14 +553,29 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
         run: int, run number
         session: str, session name
     """
+    # the clustering_scores files are saved as clustering_scores_{dFC_id}.npy
+    # find all the clustering_scores files in the directory
     if session is None:
-        clustering_scores = np.load(
-            f"{ML_root}/clustering_scores.npy", allow_pickle="TRUE"
-        ).item()
+        input_dir = f"{ML_root}"
     else:
-        clustering_scores = np.load(
-            f"{ML_root}/{session}/clustering_scores.npy", allow_pickle="TRUE"
+        input_dir = f"{ML_root}/{session}"
+    ALL_CLUSTERING_SCORES = os.listdir(input_dir)
+    ALL_CLUSTERING_SCORES = [
+        score_file
+        for score_file in ALL_CLUSTERING_SCORES
+        if "clustering_scores" in score_file
+    ]
+    ALL_CLUSTERING_SCORES.sort()
+    clustering_scores = None
+    for score_file in ALL_CLUSTERING_SCORES:
+        clustering_scores_new = np.load(
+            f"{input_dir}/{score_file}", allow_pickle="TRUE"
         ).item()
+        if clustering_scores is None:
+            clustering_scores = clustering_scores_new
+        else:
+            for key in clustering_scores_new.keys():
+                clustering_scores[key].extend(clustering_scores_new[key])
 
     sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0})
 
@@ -956,15 +986,15 @@ def create_html_report_group_results(
                 else:
                     classification_dir = f"{group_dir}/classification"
 
-                # display KNN classification results
-                file.write("<h3>KNN</h3>\n")
+                # display Random Forest classification results
+                file.write("<h3>Random Forest</h3>\n")
                 if run is None:
                     classification_img = (
-                        f"{classification_dir}/ML_results_classify_KNN_{task}.png"
+                        f"{classification_dir}/ML_results_classify_RF_{task}.png"
                     )
                 else:
                     classification_img = (
-                        f"{classification_dir}/ML_results_classify_KNN_{task}_{run}.png"
+                        f"{classification_dir}/ML_results_classify_RF_{task}_{run}.png"
                     )
                 img = plt.imread(classification_img)
                 height, width, _ = img.shape
@@ -1233,10 +1263,10 @@ def create_html_report_group_results(
                         task=task,
                         run=run,
                         session=session,
-                        ML_algorithm="KNN",
+                        ML_algorithm="Random Forest",
                     )
                 except Exception as e:
-                    print(f"Error in plotting ML results for KNN: {e}")
+                    print(f"Error in plotting ML results for RF: {e}")
                 try:
                     plot_ML_results(
                         ML_root=ML_root,

From d9c8fc4eff1205d237c834a4fd817ac3b1b3e362 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 11 Jul 2024 11:46:22 -0400
Subject: [PATCH 079/401] adjust fig width in report

---
 task_dFC/generate_report.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index df60342..44a9835 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -158,7 +158,7 @@ def plot_roi_signals(
     start_TR = int(start_time / TR_mri)
     end_TR = int(end_time / TR_mri)
     # keep the figure width proportional to the number of time points
-    fig_width = int(2.5 * (end_time - start_time) / TR_mri)
+    fig_width = int(2.5 * (end_time - start_time) / 2)
     fig_width = min(fig_width, 500)
     plt.figure(figsize=(fig_width, 5))
     for i in nodes_list:
@@ -209,13 +209,13 @@ def plot_event_labels(
     task_data = load_task_data(roi_root, subj, task, run, session)
     Fs_task = task_data["Fs_task"]
     TR_task = 1 / Fs_task
-    TR_mri = task_data["TR_mri"]
+    # TR_mri = task_data["TR_mri"]
 
     time = np.arange(0, task_data["event_labels"].shape[0]) / Fs_task
     start_timepoint = int(start_time / TR_task)
     end_timepoint = int(end_time / TR_task)
     # keep the figure width proportional to the number of time points
-    fig_width = int(2.5 * (end_time - start_time) / TR_mri)
+    fig_width = int(2.5 * (end_time - start_time) / 2)
     fig_width = min(fig_width, 500)
     plt.figure(figsize=(fig_width, 5))
     plt.plot(
@@ -284,7 +284,7 @@ def plot_task_presence(
     start_TR = int(start_time / TR_mri)
     end_TR = int(end_time / TR_mri)
     # keep the figure width proportional to the number of time points in data
-    fig_width = int(2.5 * (end_time - start_time) / TR_mri)
+    fig_width = int(2.5 * (end_time - start_time) / 2)
     fig_width = min(fig_width, 500)
     plt.figure(figsize=(fig_width, 5))
     plt.plot(
@@ -685,7 +685,7 @@ def plot_dFC_clustering(
         start_TR_idx = np.where(np.array(TR_array) >= start_TR)[0][0]
         end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1]
 
-        fig_width = int(2.5 * (end_time - start_time) / TR_mri)
+        fig_width = int(2.5 * (end_time - start_time) / 2)
         fig_width = min(fig_width, 500)
         plt.figure(figsize=(fig_width, 5))
         time = TR_array[start_TR_idx:end_TR_idx] * TR_mri

From 6f45fc5b518c557d850fa311e71fea9834ed9feb Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 15 Jul 2024 12:04:20 -0400
Subject: [PATCH 080/401] add paradigm clstr to report

---
 task_dFC/generate_report.py | 123 ++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 44a9835..9045b59 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -628,6 +628,91 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
     plt.close()
 
 
+def plot_paradigm_clustering(
+    ML_root,
+    output_root,
+    session=None,
+):
+    """
+    Plot the task paradigm clustering ARI score of each dFC method.
+    parameters:
+    ----------
+        ML_root: str, path to ML results
+        output_root: str, path to save the figures
+        session: str, session name (None when there are no sessions)
+    the figure is saved as paradigm_clustering_results.{save_fig_format} in
+    {output_root}/group_results/paradigm_clustering[/{session}]
+    """
+    # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy
+    # find all the paradigm_clustering_RESULTS files in the directory
+    if session is None:
+        input_dir = f"{ML_root}"
+    else:
+        input_dir = f"{ML_root}/{session}"
+    ALL_PARADIGM_CLUSTERING_RESULTS = os.listdir(input_dir)
+    ALL_PARADIGM_CLUSTERING_RESULTS = [
+        result_file
+        for result_file in ALL_PARADIGM_CLUSTERING_RESULTS
+        if "task_paradigm_clstr_RESULTS_" in result_file
+    ]
+    ALL_PARADIGM_CLUSTERING_RESULTS.sort()
+    paradigm_clustering_RESULTS = {
+        "dFC method": [],
+        "ARI score": [],
+    }
+    for result_file in ALL_PARADIGM_CLUSTERING_RESULTS:
+        paradigm_clustering_RESULTS_new = np.load(
+            f"{input_dir}/{result_file}", allow_pickle="TRUE"
+        ).item()
+        paradigm_clustering_RESULTS["dFC method"].append(
+            result_file[result_file.find("task_paradigm_clstr_RESULTS_") + 28 : -4]
+        )
+        # paradigm_clustering_RESULTS["dFC method"].append(
+        #     paradigm_clustering_RESULTS_new["dFC_method"]
+        # )
+        paradigm_clustering_RESULTS["ARI score"].append(
+            paradigm_clustering_RESULTS_new["ARI"]
+        )
+
+    sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0})
+
+    sns.set_style("darkgrid")
+
+    dataframe = pd.DataFrame(paradigm_clustering_RESULTS)
+
+    plt.figure(figsize=(10, 5))
+    g = sns.pointplot(
+        data=dataframe,
+        x="dFC method",
+        y="ARI score",
+        linestyle="none",
+        dodge=True,
+        capsize=0.1,
+    )
+    g.axhline(0.0, color="r", linestyle="--")
+    if show_title:
+        g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"})
+
+    # save the figure
+    if session is None:
+        output_dir = f"{output_root}/group_results/paradigm_clustering"
+    else:
+        output_dir = f"{output_root}/group_results/paradigm_clustering/{session}"
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    plt.savefig(
+        f"{output_dir}/paradigm_clustering_results.{save_fig_format}",
+        dpi=fig_dpi,
+        bbox_inches=fig_bbox_inches,
+        pad_inches=fig_pad,
+        format=save_fig_format,
+    )
+
+    plt.close()
+
+
 def plot_dFC_clustering(
     dFC_root,
     subj,
@@ -1060,6 +1145,34 @@ def create_html_report_group_results(
                 )
 
                 file.write("<br>\n")
+
+    # paradigm clustering results
+    img_height = 300
+    file.write("<h1>Paradigm Clustering Results</h1>\n")
+    for session in SESSIONS:
+        if session is not None:
+            file.write(f"<h1> {session} </h1>\n")
+        if session is not None:
+            paradigm_clustering_dir = f"{group_dir}/paradigm_clustering/{session}"
+        else:
+            paradigm_clustering_dir = f"{group_dir}/paradigm_clustering"
+
+        # display paradigm clustering results
+        paradigm_clustering_img = (
+            f"{paradigm_clustering_dir}/paradigm_clustering_results.png"
+        )
+        img = plt.imread(paradigm_clustering_img)
+        height, width, _ = img.shape
+        # change the width so that height equals img_height
+        width = int(width * img_height / height)
+        # replace the path to the image with a relative path
+        paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".")
+        file.write(
+            f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
+        )
+
+        file.write("<br>\n")
+
     file.write("</body>\n")
     file.write("</html>\n")
     file.close()
@@ -1229,6 +1342,7 @@ def create_html_report_group_results(
         except Exception as e:
             print(f"Error in creating html report for subject results: {e}")
 
+    # plot group results
     # find the common run number for all tasks for task presence features
     common_run = None
     for task in TASKS:
@@ -1254,6 +1368,15 @@ def create_html_report_group_results(
         except Exception as e:
             print(f"Error in plotting task presence features: {e}")
 
+        try:
+            plot_paradigm_clustering(
+                ML_root=ML_root,
+                output_root=reports_root,
+                session=session,
+            )
+        except Exception as e:
+            print(f"Error in plotting paradigm clustering: {e}")
+
         for task in TASKS:
             for run in RUNS[task]:
                 try:

From cbbb149c3a82a78d7fac002e64b973fa35306023 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 15 Jul 2024 16:34:21 -0400
Subject: [PATCH 081/401] add plot centroids to report

---
 task_dFC/generate_report.py | 122 +++++++++++++++++++++++++++++++++---
 1 file changed, 115 insertions(+), 7 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 9045b59..24d8cc7 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -12,7 +12,13 @@
 from sklearn.preprocessing import StandardScaler
 
 from pydfc import DFC, data_loader, task_utils
-from pydfc.dfc_utils import TR_intersection, dFC_mat2vec, dFC_vec2mat, rank_norm
+from pydfc.dfc_utils import (
+    TR_intersection,
+    dFC_mat2vec,
+    dFC_vec2mat,
+    rank_norm,
+    visualize_conn_mat_dict,
+)
 
 ################################# Parameters ####################################
 
@@ -628,7 +634,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
     plt.close()
 
 
-def plot_paradigm_clustering(
+def plot_paradigm_clustering_score(
     ML_root,
     output_root,
     session=None,
@@ -691,7 +697,10 @@ def plot_paradigm_clustering(
     )
     g.axhline(0.0, color="r", linestyle="--")
     if show_title:
-        g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"})
+        g.set_title(
+            "Task Paradigm Clustering Performance",
+            fontdict={"fontsize": 10, "fontweight": "bold"},
+        )
 
     # save the figure
     if session is None:
@@ -713,6 +722,65 @@ def plot_paradigm_clustering(
     plt.close()
 
 
+def plot_paradigm_clstr_centroids(
+    ML_root,
+    output_root,
+    session=None,
+):
+    """Visualize and save the task paradigm clustering centroids of each results file."""
+    # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy
+    # find all the paradigm_clustering_RESULTS files in the directory
+    if session is None:
+        input_dir = f"{ML_root}"
+    else:
+        input_dir = f"{ML_root}/{session}"
+
+    if session is None:
+        output_dir = f"{output_root}/group_results/paradigm_clustering_centroids"
+    else:
+        output_dir = (
+            f"{output_root}/group_results/paradigm_clustering_centroids/{session}"
+        )
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    ALL_PARADIGM_CLUSTERING_RESULTS = os.listdir(input_dir)
+    ALL_PARADIGM_CLUSTERING_RESULTS = [
+        result_file
+        for result_file in ALL_PARADIGM_CLUSTERING_RESULTS
+        if "task_paradigm_clstr_RESULTS_" in result_file
+    ]
+    ALL_PARADIGM_CLUSTERING_RESULTS.sort()
+
+    for result_file in ALL_PARADIGM_CLUSTERING_RESULTS:
+        paradigm_clustering_RESULTS_new = np.load(
+            f"{input_dir}/{result_file}", allow_pickle="TRUE"
+        ).item()
+
+        # measure_name = paradigm_clustering_RESULTS_new["dFC_method"]
+        measure_name = result_file[
+            result_file.find("task_paradigm_clstr_RESULTS_") + 28 : -4
+        ]
+        centroids_mats = paradigm_clustering_RESULTS_new["centroids"]
+
+        centroids_dict = {}
+        for i, centroid_mat in enumerate(centroids_mats):
+            centroids_dict[f"Cluster {i + 1}"] = centroid_mat
+
+        visualize_conn_mat_dict(
+            data=centroids_dict,
+            title=f"Task Paradigm Centroids {measure_name}",
+            cmap="seismic",
+            normalize=True,
+            disp_diag=False,
+            save_image=True,
+            output_root=output_dir,
+            center_0=True,
+            # node_networks=None,
+        )
+
+
 def plot_dFC_clustering(
     dFC_root,
     subj,
@@ -1147,7 +1215,6 @@ def create_html_report_group_results(
                 file.write("<br>\n")
 
     # paradigm clustering results
-    img_height = 300
     file.write("<h1>Paradigm Clustering Results</h1>\n")
     for session in SESSIONS:
         if session is not None:
@@ -1157,7 +1224,9 @@ def create_html_report_group_results(
         else:
             paradigm_clustering_dir = f"{group_dir}/paradigm_clustering"
 
-        # display paradigm clustering results
+        # display paradigm clustering scores
+        img_height = 300
+        file.write("<h2>Paradigm Clustering Scores</h2>\n")
         paradigm_clustering_img = (
             f"{paradigm_clustering_dir}/paradigm_clustering_results.png"
         )
@@ -1173,6 +1242,36 @@ def create_html_report_group_results(
 
         file.write("<br>\n")
 
+        # display paradigm clustering centroids
+        img_height = 300
+        file.write("<h2>Paradigm Clustering Centroids</h2>\n")
+        # find all png files in the directory
+        paradigm_clustering_centroids_dir = f"{group_dir}/paradigm_clustering_centroids"
+        for file_name in os.listdir(paradigm_clustering_centroids_dir):
+            if file_name.endswith(".png"):
+                measure_name = file_name[
+                    file_name.find("Task_Paradigm_Centroids_") + 25 : -4
+                ]
+                file.write(f"<h3>{measure_name}</h3>\n")
+                paradigm_clustering_centroids_img = (
+                    f"{paradigm_clustering_centroids_dir}/{file_name}"
+                )
+                # get the original size of the image
+                img = plt.imread(paradigm_clustering_centroids_img)
+                height, width, _ = img.shape
+                # change the width so that height equals img_height
+                width = int(width * img_height / height)
+                # replace the path to the image with a relative path
+                paradigm_clustering_centroids_img = (
+                    paradigm_clustering_centroids_img.replace(group_dir, ".")
+                )
+                file.write(
+                    f"<img src='{paradigm_clustering_centroids_img}' alt='Paradigm clustering centroids' width='{width}' height='{img_height}'>\n"
+                )
+                file.write("<br>\n")
+
+        file.write("<br>\n")
+
     file.write("</body>\n")
     file.write("</html>\n")
     file.close()
@@ -1369,13 +1468,22 @@ def create_html_report_group_results(
             print(f"Error in plotting task presence features: {e}")
 
         try:
-            plot_paradigm_clustering(
+            plot_paradigm_clustering_score(
+                ML_root=ML_root,
+                output_root=reports_root,
+                session=session,
+            )
+        except Exception as e:
+            print(f"Error in plotting paradigm clustering scores: {e}")
+
+        try:
+            plot_paradigm_clstr_centroids(
                 ML_root=ML_root,
                 output_root=reports_root,
                 session=session,
             )
         except Exception as e:
-            print(f"Error in plotting paradigm clustering: {e}")
+            print(f"Error in plotting paradigm clustering centroids: {e}")
 
         for task in TASKS:
             for run in RUNS[task]:

From 25c96028744baeabd374140625a5b1cc2e26cb4f Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 15 Jul 2024 17:27:49 -0400
Subject: [PATCH 082/401] minor fix

---
 task_dFC/generate_report.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 24d8cc7..f1c7a4c 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -775,7 +775,7 @@ def plot_paradigm_clstr_centroids(
             normalize=True,
             disp_diag=False,
             save_image=True,
-            output_root=output_dir,
+            output_root=f"{output_dir}/",
             center_0=True,
             # node_networks=None,
         )

From cb0a1b490575d060380f5e85d31647e6adc5dd70 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 15 Jul 2024 17:50:14 -0400
Subject: [PATCH 083/401] switch to GBT in ML

---
 task_dFC/ML.py              | 106 +++++++++++++++++++++++++++++++-----
 task_dFC/generate_report.py |   2 +-
 2 files changed, 94 insertions(+), 14 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index eb853a1..31eee54 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -5,7 +5,7 @@
 import numpy as np
 from sklearn.cluster import KMeans
 from sklearn.decomposition import PCA
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
@@ -482,6 +482,62 @@ def random_forest_classify(
     return RESULT
 
 
+def gradient_boosting_classify(
+    X_train, y_train, X_test, y_test, explained_var_threshold=0.95
+):
+    """
+    Grid-search and fit a StandardScaler + PCA + Gradient Boosting pipeline; returns a dict of results.
+    """
+    # find num_PCs: number of PCs (of the unscaled X_train) needed to exceed explained_var_threshold
+    pca = PCA(svd_solver="full", whiten=False)
+    pca.fit(X_train)
+    num_PCs = (
+        np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
+        + 1
+    )
+    num_PCs = min(num_PCs, 100)
+
+    # create a pipeline with a gradient boosting model to find the best hyperparameters
+    gb = make_pipeline(
+        StandardScaler(),
+        PCA(n_components=num_PCs, svd_solver="full", whiten=False),
+        GradientBoostingClassifier(),
+    )
+    # create a dictionary of all values we want to test for n_estimators, learning_rate and max_depth
+    param_grid = {
+        "gradientboostingclassifier__n_estimators": [10, 50, 100, 200],
+        "gradientboostingclassifier__learning_rate": [0.01, 0.1, 0.2],
+        "gradientboostingclassifier__max_depth": [3, 5, 10],
+    }
+    # use gridsearch (5-fold cross-validation) to test all combinations in param_grid
+    gb_gscv = GridSearchCV(gb, param_grid, cv=5)
+    # fit model to data
+    gb_gscv.fit(X_train, y_train)
+
+    n_estimators = gb_gscv.best_params_["gradientboostingclassifier__n_estimators"]
+    learning_rate = gb_gscv.best_params_["gradientboostingclassifier__learning_rate"]
+    max_depth = gb_gscv.best_params_["gradientboostingclassifier__max_depth"]
+
+    gb = make_pipeline(
+        StandardScaler(),
+        PCA(n_components=num_PCs, svd_solver="full", whiten=False),
+        GradientBoostingClassifier(
+            n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate
+        ),
+    ).fit(X_train, y_train)
+
+    RESULT = {
+        "GB_pca": pca,
+        "GB_num_PCs": num_PCs,
+        "GB_cv_results": gb_gscv.cv_results_,
+        "GB_model": gb,
+        "GB_train_score": gb.score(X_train, y_train),
+        "GB_test_score": gb.score(X_test, y_test),
+    }
+
+    return RESULT
+
+
 def task_presence_classification(
     task,
     dFC_id,
@@ -495,7 +551,7 @@ def task_presence_classification(
     explained_var_threshold=0.95,
 ):
     """
-    perform task presence classification using logistic regression, KNN, or Random Forest
+    perform task presence classification using logistic regression, KNN, Random Forest, Gradient Boosting
     for a given task and dFC method and run.
     """
     if run is None:
@@ -547,8 +603,13 @@ def task_presence_classification(
     #     X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold
     # )
 
-    # Random Forest
-    RF_RESULT = random_forest_classify(
+    # # Random Forest
+    # RF_RESULT = random_forest_classify(
+    #     X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold
+    # )
+
+    # Gradient Boosting
+    GBT_RESULT = gradient_boosting_classify(
         X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold
     )
 
@@ -557,8 +618,10 @@ def task_presence_classification(
         ML_RESULT[key] = log_reg_RESULT[key]
     # for key in KNN_RESULT:
     #     ML_RESULT[key] = KNN_RESULT[key]
-    for key in RF_RESULT:
-        ML_RESULT[key] = RF_RESULT[key]
+    # for key in RF_RESULT:
+    #     ML_RESULT[key] = RF_RESULT[key]
+    for key in GBT_RESULT:
+        ML_RESULT[key] = GBT_RESULT[key]
 
     # measure pred score on each subj
 
@@ -570,11 +633,13 @@ def task_presence_classification(
         "dFC method": list(),
         "Logistic regression accuracy": list(),
         # "KNN accuracy": list(),
-        "Random Forest accuracy": list(),
+        # "Random Forest accuracy": list(),
+        "Gradient Boosting accuracy": list(),
     }
     log_reg = log_reg_RESULT["log_reg_model"]
     # KNN = KNN_RESULT["KNN_model"]
-    RF = RF_RESULT["RF_model"]
+    # RF = RF_RESULT["RF_model"]
+    GBT = GBT_RESULT["GB_model"]
 
     for subj in SUBJECTS:
         ML_scores["subj_id"].append(subj)
@@ -589,14 +654,18 @@ def task_presence_classification(
 
         pred_lr = log_reg.predict(features)
         # pred_KNN = KNN.predict(features)
-        pred_RF = RF.predict(features)
+        # pred_RF = RF.predict(features)
+        pred_GBT = GBT.predict(features)
 
         ML_scores["Logistic regression accuracy"].append(
             balanced_accuracy_score(target, pred_lr)
         )
         # ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN))
-        ML_scores["Random Forest accuracy"].append(
-            balanced_accuracy_score(target, pred_RF)
+        # ML_scores["Random Forest accuracy"].append(
+        #     balanced_accuracy_score(target, pred_RF)
+        # )
+        ML_scores["Gradient Boosting accuracy"].append(
+            balanced_accuracy_score(target, pred_GBT)
         )
 
         ML_scores["task"].append(task)
@@ -730,7 +799,8 @@ def run_classification(
             "dFC method": list(),
             "Logistic regression accuracy": list(),
             # "KNN accuracy": list(),
-            "Random Forest accuracy": list(),
+            # "Random Forest accuracy": list(),
+            "Gradient Boosting accuracy": list(),
         }
 
         ML_RESULT = {}
@@ -844,9 +914,10 @@ def task_paradigm_clustering(
 
         X = None
         y = None
+        measure_name = None
         for task_id, task in enumerate(TASKS):
             for run in RUNS[task]:
-                X_new, _, _, _, _, _, measure_name = dFC_feature_extraction(
+                X_new, _, _, _, _, _, measure_name_new = dFC_feature_extraction(
                     task=task,
                     train_subjects=SUBJECTS,
                     test_subjects=[],
@@ -858,6 +929,14 @@ def task_paradigm_clustering(
                     dynamic_pred="no",
                     normalize_dFC=normalize_dFC,
                 )
+
+                if measure_name is not None:
+                    assert (
+                        measure_name == measure_name_new
+                    ), "dFC measure is not consistent."
+                else:
+                    measure_name = measure_name_new
+
                 y_new = np.ones(X_new.shape[0]) * task_id
                 if X is None and y is None:
                     X = X_new
@@ -902,6 +981,7 @@ def task_paradigm_clustering(
         centroids_mat = dFC_vec2mat(centroids, n_regions)
 
         task_paradigm_clstr_RESULTS = {
+            "dFC_method": measure_name,
             "StandardScaler": scaler,
             "num_PCs": n_components,
             "PCA": pca,
diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index f1c7a4c..5c56b02 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -1250,7 +1250,7 @@ def create_html_report_group_results(
         for file_name in os.listdir(paradigm_clustering_centroids_dir):
             if file_name.endswith(".png"):
                 measure_name = file_name[
-                    file_name.find("Task_Paradigm_Centroids_") + 25 : -4
+                    file_name.find("Task_Paradigm_Centroids_") + 24 : -4
                 ]
                 file.write(f"<h3>{measure_name}</h3>\n")
                 paradigm_clustering_centroids_img = (

From 719d7a90306ca4aa533759b4a13789bc547ad1a7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 15 Jul 2024 18:05:14 -0400
Subject: [PATCH 084/401] add run_simulator

---
 simul_dFC/run_scripts/run_simulator.sh | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 simul_dFC/run_scripts/run_simulator.sh

diff --git a/simul_dFC/run_scripts/run_simulator.sh b/simul_dFC/run_scripts/run_simulator.sh
new file mode 100644
index 0000000..e7f6394
--- /dev/null
+++ b/simul_dFC/run_scripts/run_simulator.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# SGE job script: runs task_data_simulator.py inside the pydfc conda environment
+#$ -cwd
+#$ -j y
+#$ -o logs/simul_out.txt
+#$ -e logs/simul_err.txt
+#$ -q origami.q
+#$ -l h_vmem=8G
+#$ -t 1:200
+
+source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
+conda activate pydfc
+python "/data/origami/dFC/CODEs/pydfc/dFC/simul_dFC/task_data_simulator.py"
+conda deactivate

From 80bef3a78d3661409603e76bf9a8570bb3074b6a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 18 Jul 2024 17:27:27 -0400
Subject: [PATCH 085/401] add SI to ML

---
 task_dFC/ML.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 31eee54..a06e686 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -7,7 +7,7 @@
 from sklearn.decomposition import PCA
 from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
 from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score
+from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score, silhouette_score
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.pipeline import make_pipeline
@@ -758,6 +758,8 @@ def task_presence_clustering(
         "run": list(),
         "dFC method": list(),
         "Kmeans ARI": list(),
+        "SI": list(),
+        "SI_pca": list(),
     }
     for subj in SUBJECTS:
         clustering_scores["subj_id"].append(subj)
@@ -770,6 +772,11 @@ def task_presence_clustering(
 
         clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_kmeans))
 
+        # silhouette score in terms of separability of original labels, not the clustering labels
+        # using both original features and PCA features
+        clustering_scores["SI"].append(silhouette_score(features, target))
+        clustering_scores["SI_pca"].append(silhouette_score(features_pca, target))
+
         clustering_scores["task"].append(task)
         clustering_scores["run"].append(run)
         clustering_scores["dFC method"].append(measure_name)
@@ -854,6 +861,8 @@ def run_clustering(
             "run": list(),
             "dFC method": list(),
             "Kmeans ARI": list(),
+            "SI": list(),
+            "SI_pca": list(),
         }
 
         clustering_RESULTS = {}
@@ -987,6 +996,8 @@ def task_paradigm_clustering(
             "PCA": pca,
             "kmeans": kmeans,
             "ARI": adjusted_rand_score(y, labels_pred),
+            "SI": silhouette_score(X, y),
+            "SI_pca": silhouette_score(X_pca, y),
             "centroids": centroids_mat,
             "task_paradigms": TASKS,
         }

From 8cfef3fd7dbaaad6db733dc98eb7e50dd3f9ddde Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 29 Jul 2024 15:24:22 -0400
Subject: [PATCH 086/401] change RF to GBT in report

---
 task_dFC/generate_report.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 5c56b02..2271ccf 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -1140,14 +1140,14 @@ def create_html_report_group_results(
                     classification_dir = f"{group_dir}/classification"
 
                 # display Random Forest classification results
-                file.write("<h3>Random Forest</h3>\n")
+                file.write("<h3>Gradient Boosting</h3>\n")
                 if run is None:
                     classification_img = (
-                        f"{classification_dir}/ML_results_classify_RF_{task}.png"
+                        f"{classification_dir}/ML_results_classify_GBT_{task}.png"
                     )
                 else:
                     classification_img = (
-                        f"{classification_dir}/ML_results_classify_RF_{task}_{run}.png"
+                        f"{classification_dir}/ML_results_classify_GBT_{task}_{run}.png"
                     )
                 img = plt.imread(classification_img)
                 height, width, _ = img.shape
@@ -1494,10 +1494,10 @@ def create_html_report_group_results(
                         task=task,
                         run=run,
                         session=session,
-                        ML_algorithm="Random Forest",
+                        ML_algorithm="Gradient Boosting",
                     )
                 except Exception as e:
-                    print(f"Error in plotting ML results for RF: {e}")
+                    print(f"Error in plotting ML results for GBT: {e}")
                 try:
                     plot_ML_results(
                         ML_root=ML_root,

From 196b2f13095ce5ee0ad4804502918c3930bb0baf Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 29 Jul 2024 18:28:38 -0400
Subject: [PATCH 087/401] change ML from GBT to KNN

---
 task_dFC/ML.py | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index a06e686..3779cdf 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -598,30 +598,30 @@ def task_presence_classification(
     # logistic regression
     log_reg_RESULT = logistic_regression_classify(X_train, y_train, X_test, y_test)
 
-    # # KNN
-    # KNN_RESULT = KNN_classify(
-    #     X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold
-    # )
+    # KNN
+    KNN_RESULT = KNN_classify(
+        X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold
+    )
 
     # # Random Forest
     # RF_RESULT = random_forest_classify(
     #     X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold
     # )
 
-    # Gradient Boosting
-    GBT_RESULT = gradient_boosting_classify(
-        X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold
-    )
+    # # Gradient Boosting
+    # GBT_RESULT = gradient_boosting_classify(
+    #     X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold
+    # )
 
     ML_RESULT = {}
     for key in log_reg_RESULT:
         ML_RESULT[key] = log_reg_RESULT[key]
-    # for key in KNN_RESULT:
-    #     ML_RESULT[key] = KNN_RESULT[key]
+    for key in KNN_RESULT:
+        ML_RESULT[key] = KNN_RESULT[key]
     # for key in RF_RESULT:
     #     ML_RESULT[key] = RF_RESULT[key]
-    for key in GBT_RESULT:
-        ML_RESULT[key] = GBT_RESULT[key]
+    # for key in GBT_RESULT:
+    #     ML_RESULT[key] = GBT_RESULT[key]
 
     # measure pred score on each subj
 
@@ -632,14 +632,14 @@ def task_presence_classification(
         "run": list(),
         "dFC method": list(),
         "Logistic regression accuracy": list(),
-        # "KNN accuracy": list(),
+        "KNN accuracy": list(),
         # "Random Forest accuracy": list(),
-        "Gradient Boosting accuracy": list(),
+        # "Gradient Boosting accuracy": list(),
     }
     log_reg = log_reg_RESULT["log_reg_model"]
-    # KNN = KNN_RESULT["KNN_model"]
+    KNN = KNN_RESULT["KNN_model"]
     # RF = RF_RESULT["RF_model"]
-    GBT = GBT_RESULT["GB_model"]
+    # GBT = GBT_RESULT["GB_model"]
 
     for subj in SUBJECTS:
         ML_scores["subj_id"].append(subj)
@@ -653,20 +653,20 @@ def task_presence_classification(
             target = y_test[subj_label_test == subj]
 
         pred_lr = log_reg.predict(features)
-        # pred_KNN = KNN.predict(features)
+        pred_KNN = KNN.predict(features)
         # pred_RF = RF.predict(features)
-        pred_GBT = GBT.predict(features)
+        # pred_GBT = GBT.predict(features)
 
         ML_scores["Logistic regression accuracy"].append(
             balanced_accuracy_score(target, pred_lr)
         )
-        # ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN))
+        ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN))
         # ML_scores["Random Forest accuracy"].append(
         #     balanced_accuracy_score(target, pred_RF)
         # )
-        ML_scores["Gradient Boosting accuracy"].append(
-            balanced_accuracy_score(target, pred_GBT)
-        )
+        # ML_scores["Gradient Boosting accuracy"].append(
+        #     balanced_accuracy_score(target, pred_GBT)
+        # )
 
         ML_scores["task"].append(task)
         ML_scores["run"].append(run)
@@ -805,9 +805,9 @@ def run_classification(
             "run": list(),
             "dFC method": list(),
             "Logistic regression accuracy": list(),
-            # "KNN accuracy": list(),
+            "KNN accuracy": list(),
             # "Random Forest accuracy": list(),
-            "Gradient Boosting accuracy": list(),
+            # "Gradient Boosting accuracy": list(),
         }
 
         ML_RESULT = {}

From 6d0f1e8f07335ce71b38ae3c18efb617d0204f2b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 29 Jul 2024 18:28:54 -0400
Subject: [PATCH 088/401] minor change

---
 task_dFC/generate_report.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 2271ccf..2002bf6 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -671,11 +671,8 @@ def plot_paradigm_clustering_score(
             f"{input_dir}/{result_file}", allow_pickle="TRUE"
         ).item()
         paradigm_clustering_RESULTS["dFC method"].append(
-            result_file[result_file.find("task_paradigm_clstr_RESULTS_") + 27 : -4]
+            paradigm_clustering_RESULTS_new["dFC_method"]
         )
-        # paradigm_clustering_RESULTS["dFC method"].append(
-        #     paradigm_clustering_RESULTS_new["dFC_method"]
-        # )
         paradigm_clustering_RESULTS["ARI score"].append(
             paradigm_clustering_RESULTS_new["ARI"]
         )
@@ -758,10 +755,7 @@ def plot_paradigm_clstr_centroids(
             f"{input_dir}/{result_file}", allow_pickle="TRUE"
         ).item()
 
-        # measure_name = paradigm_clustering_RESULTS_new["dFC_method"]
-        measure_name = result_file[
-            result_file.find("task_paradigm_clstr_RESULTS_") + 28 : -4
-        ]
+        measure_name = paradigm_clustering_RESULTS_new["dFC_method"]
         centroids_mats = paradigm_clustering_RESULTS_new["centroids"]
 
         centroids_dict = {}

From 6aa93598a6b0aeb3b29920130baeb79fe769e77f Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 30 Jul 2024 14:20:15 -0400
Subject: [PATCH 089/401] add manifold learning

---
 task_dFC/ML.py | 309 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 191 insertions(+), 118 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 3779cdf..1db529a 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -3,10 +3,12 @@
 import os
 
 import numpy as np
+from scipy.spatial import procrustes
 from sklearn.cluster import KMeans
 from sklearn.decomposition import PCA
 from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
 from sklearn.linear_model import LogisticRegression
+from sklearn.manifold import SpectralEmbedding
 from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score, silhouette_score
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 from sklearn.neighbors import KNeighborsClassifier
@@ -245,6 +247,101 @@ def load_task_data(roi_root, subj, task, run=None, session=None):
     return task_data
 
 
+def embed_dFC_features(
+    train_subjects,
+    test_subjects,
+    X_train,
+    X_test,
+    y_train,
+    y_test,
+    subj_label_train,
+    subj_label_test,
+    embedding="PCA",
+    n_components=30,
+    n_neighbors_LE=90,
+):
+    """
+    Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous.
+
+    for LE, first the LE is applied on each subj separately and then the procrustes transformation is applied to align the embeddings of different subjects.
+    All the subjects are transformed into the space of the subject with the highest silhouette score.
+    """
+    if embedding == "PCA":
+        pca = PCA(n_components=n_components, svd_solver="full", whiten=False)
+        pca.fit(X_train)
+        X_train_embed = pca.transform(X_train)
+        X_test_embed = pca.transform(X_test)
+    elif embedding == "LE":
+        # first embed the dFC features of each subject into a lower dimensional space using LE separately
+        embed_dict = {}
+        for subject in train_subjects:
+            # assert the samples of the same subject are contiguous
+            assert np.all(
+                np.diff(np.where(subj_label_train == subject)[0]) == 1
+            ), f"Indices of {subject} are not consecutive"
+            X_subj = X_train[subj_label_train == subject, :]
+            y_subj = y_train[subj_label_train == subject]
+            LE = SpectralEmbedding(
+                n_components=n_components,
+                n_neighbors=n_neighbors_LE,
+            )
+            X_subj_embed = LE.fit_transform(X_subj)
+            SI = silhouette_score(X_subj_embed, y_subj)
+            embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI}
+
+        # find the best transformation based on the SI score
+        best_SI = -1
+        best_subject = None
+        for subject in embed_dict:
+            if embed_dict[subject]["SI"] > best_SI:
+                best_SI = embed_dict[subject]["SI"]
+                best_subject = subject
+
+        # apply procrustes transformation to align the embeddings of different subjects
+        # use the embeddings of the subject with the highest SI score as the reference
+        X_train_embed = None
+        for subject in train_subjects:
+            X_subj_embed = embed_dict[subject]["X_subj_embed"]
+            # procrustes transformation
+            if subject == best_subject:
+                X_subj_embed_transformed = X_subj_embed
+            else:
+                _, X_subj_embed_transformed, _ = procrustes(
+                    embed_dict[best_subject]["X_subj_embed"], X_subj_embed
+                )
+            if X_train_embed is None:
+                X_train_embed = X_subj_embed_transformed
+            else:
+                X_train_embed = np.concatenate(
+                    (X_train_embed, X_subj_embed_transformed), axis=0
+                )
+
+        # apply the same transformation to the test set
+        X_test_embed = None
+        for subject in test_subjects:
+            # assert the samples of the same subject are contiguous
+            assert np.all(
+                np.diff(np.where(subj_label_test == subject)[0]) == 1
+            ), f"Indices of {subject} are not consecutive"
+            X_subj = X_test[subj_label_test == subject, :]
+            LE = SpectralEmbedding(
+                n_components=n_components,
+                n_neighbors=n_neighbors_LE,
+            )
+            X_subj_embed = LE.fit_transform(X_subj)
+            _, X_subj_embed_transformed, _ = procrustes(
+                embed_dict[best_subject]["X_subj_embed"], X_subj_embed
+            )
+            if X_test_embed is None:
+                X_test_embed = X_subj_embed_transformed
+            else:
+                X_test_embed = np.concatenate(
+                    (X_test_embed, X_subj_embed_transformed), axis=0
+                )
+
+    return X_train_embed, X_test_embed
+
+
 def dFC_feature_extraction(
     task,
     train_subjects,
@@ -385,22 +482,13 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test):
     return RESULT
 
 
-def KNN_classify(X_train, y_train, X_test, y_test, explained_var_threshold=0.95):
+def KNN_classify(X_train, y_train, X_test, y_test):
     """
     KNN classification
     """
-    # find num_PCs
-    pca = PCA(svd_solver="full", whiten=False)
-    pca.fit(X_train)
-    num_PCs = (
-        np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
-        + 1
-    )
-    num_PCs = min(num_PCs, 100)
     # create a pipeline with a knn model to find the best n_neighbors
     knn = make_pipeline(
         StandardScaler(),
-        PCA(n_components=num_PCs, svd_solver="full", whiten=False),
         KNeighborsClassifier(),
     )
     # create a dictionary of all values we want to test for n_neighbors
@@ -414,13 +502,10 @@ def KNN_classify(X_train, y_train, X_test, y_test, explained_var_threshold=0.95)
 
     neigh = make_pipeline(
         StandardScaler(),
-        PCA(n_components=num_PCs, svd_solver="full", whiten=False),
         KNeighborsClassifier(n_neighbors=n_neighbors),
     ).fit(X_train, y_train)
 
     RESULT = {
-        "KNN_pca": pca,
-        "KNN_num_PCs": num_PCs,
         "KNN_cv_results": knn_gscv.cv_results_,
         "KNN_model": neigh,
         "KNN_train_score": neigh.score(X_train, y_train),
@@ -430,25 +515,13 @@ def KNN_classify(X_train, y_train, X_test, y_test, explained_var_threshold=0.95)
     return RESULT
 
 
-def random_forest_classify(
-    X_train, y_train, X_test, y_test, explained_var_threshold=0.95
-):
+def random_forest_classify(X_train, y_train, X_test, y_test):
     """
     Random Forest classification
     """
-    # find num_PCs
-    pca = PCA(svd_solver="full", whiten=False)
-    pca.fit(X_train)
-    num_PCs = (
-        np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
-        + 1
-    )
-    num_PCs = min(num_PCs, 100)
-
     # create a pipeline with a random forest model to find the best n_estimators
     rf = make_pipeline(
         StandardScaler(),
-        PCA(n_components=num_PCs, svd_solver="full", whiten=False),
         RandomForestClassifier(),
     )
     # create a dictionary of all values we want to test for n_estimators
@@ -466,13 +539,10 @@ def random_forest_classify(
 
     rf = make_pipeline(
         StandardScaler(),
-        PCA(n_components=num_PCs, svd_solver="full", whiten=False),
         RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth),
     ).fit(X_train, y_train)
 
     RESULT = {
-        "RF_pca": pca,
-        "RF_num_PCs": num_PCs,
         "RF_cv_results": rf_gscv.cv_results_,
         "RF_model": rf,
         "RF_train_score": rf.score(X_train, y_train),
@@ -482,25 +552,13 @@ def random_forest_classify(
     return RESULT
 
 
-def gradient_boosting_classify(
-    X_train, y_train, X_test, y_test, explained_var_threshold=0.95
-):
+def gradient_boosting_classify(X_train, y_train, X_test, y_test):
     """
     Gradient Boosting classification
     """
-    # find num_PCs
-    pca = PCA(svd_solver="full", whiten=False)
-    pca.fit(X_train)
-    num_PCs = (
-        np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
-        + 1
-    )
-    num_PCs = min(num_PCs, 100)
-
     # create a pipeline with a gradient boosting model to find the best n_estimators
     gb = make_pipeline(
         StandardScaler(),
-        PCA(n_components=num_PCs, svd_solver="full", whiten=False),
         GradientBoostingClassifier(),
     )
     # create a dictionary of all values we want to test for n_estimators
@@ -520,15 +578,12 @@ def gradient_boosting_classify(
 
     gb = make_pipeline(
         StandardScaler(),
-        PCA(n_components=num_PCs, svd_solver="full", whiten=False),
         GradientBoostingClassifier(
             n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate
         ),
     ).fit(X_train, y_train)
 
     RESULT = {
-        "GB_pca": pca,
-        "GB_num_PCs": num_PCs,
         "GB_cv_results": gb_gscv.cv_results_,
         "GB_model": gb,
         "GB_train_score": gb.score(X_train, y_train),
@@ -548,7 +603,6 @@ def task_presence_classification(
     dynamic_pred="no",
     normalize_dFC=True,
     train_test_ratio=0.8,
-    explained_var_threshold=0.95,
 ):
     """
     perform task presence classification using logistic regression, KNN, Random Forest, Gradient Boosting
@@ -591,6 +645,21 @@ def task_presence_classification(
         )
     )
 
+    # embed dFC features
+    X_train, X_test = embed_dFC_features(
+        train_subjects=train_subjects,
+        test_subjects=test_subjects,
+        X_train=X_train,
+        X_test=X_test,
+        y_train=y_train,
+        y_test=y_test,
+        subj_label_train=subj_label_train,
+        subj_label_test=subj_label_test,
+        embedding="LE",
+        n_components=30,
+        n_neighbors_LE=90,
+    )
+
     # task presence classification
 
     print("task presence classification ...")
@@ -599,18 +668,16 @@ def task_presence_classification(
     log_reg_RESULT = logistic_regression_classify(X_train, y_train, X_test, y_test)
 
     # KNN
-    KNN_RESULT = KNN_classify(
-        X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold
-    )
+    KNN_RESULT = KNN_classify(X_train, y_train, X_test, y_test)
 
     # # Random Forest
     # RF_RESULT = random_forest_classify(
-    #     X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold
+    #     X_train, y_train, X_test, y_test
     # )
 
     # # Gradient Boosting
     # GBT_RESULT = gradient_boosting_classify(
-    #     X_train, y_train, X_test, y_test, explained_var_threshold=explained_var_threshold
+    #     X_train, y_train, X_test, y_test
     # )
 
     ML_RESULT = {}
@@ -683,7 +750,6 @@ def task_presence_clustering(
     run=None,
     session=None,
     normalize_dFC=True,
-    explained_var_threshold=0.95,
 ):
     if run is None:
         print(f"=============== {task} ===============")
@@ -712,44 +778,46 @@ def task_presence_clustering(
         normalize_dFC=normalize_dFC,
     )
 
+    # embed dFC features
+    X, _ = embed_dFC_features(
+        train_subjects=SUBJECTS,
+        test_subjects=[],
+        X_train=X,
+        X_test=None,
+        y_train=y,
+        y_test=None,
+        subj_label_train=subj_label,
+        subj_label_test=None,
+        embedding="LE",
+        n_components=30,
+        n_neighbors_LE=90,
+    )
+
     # clustering
-    # apply kmeans clustering with PCA to dFC features
+    # apply kmeans clustering to dFC features
 
     n_clusters = 2  # corresponding to task and rest
 
     scaler = StandardScaler()
     X_normalized = scaler.fit_transform(X)
-    # PCA
-    # find number of components that explain 95% of variance
-    pca = PCA(svd_solver="full", whiten=False)
-    pca.fit(X_normalized)
-    n_components = (
-        np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
-        + 1
-    )
-    n_components = min(n_components, 100)
-    pca = PCA(n_components=n_components, svd_solver="full", whiten=False)
-    X_pca = pca.fit_transform(X_normalized)
     kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
-    labels_pred = kmeans.fit_predict(X_pca)
+    labels_pred = kmeans.fit_predict(X_normalized)
 
     # ARI score
     print(f"ARI score: {adjusted_rand_score(y, labels_pred)}")
 
-    # visualize clustering centroids
-    centroids = kmeans.cluster_centers_
-    centroids = pca.inverse_transform(centroids)
-    centroids = scaler.inverse_transform(centroids)
-    n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
-    centroids_mat = dFC_vec2mat(centroids, n_regions)
+    # # visualize clustering centroids
+    # centroids = kmeans.cluster_centers_
+    # centroids = pca.inverse_transform(centroids)
+    # centroids = scaler.inverse_transform(centroids)
+    # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
+    # centroids_mat = dFC_vec2mat(centroids, n_regions)
 
     clustering_RESULTS = {
         "StandardScaler": scaler,
-        "num_PCs": n_components,
-        "PCA": pca,
         "kmeans": kmeans,
         "ARI": adjusted_rand_score(y, labels_pred),
-        "centroids": centroids_mat,
+        # "centroids": centroids_mat,
     }
 
     clustering_scores = {
@@ -759,7 +827,6 @@ def task_presence_clustering(
         "dFC method": list(),
         "Kmeans ARI": list(),
         "SI": list(),
-        "SI_pca": list(),
     }
     for subj in SUBJECTS:
         clustering_scores["subj_id"].append(subj)
@@ -767,15 +834,12 @@ def task_presence_clustering(
         target = y[subj_label == subj]
 
         features_normalized = scaler.transform(features)
-        features_pca = pca.transform(features_normalized)
-        pred_kmeans = kmeans.predict(features_pca)
+        pred_kmeans = kmeans.predict(features_normalized)
 
         clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_kmeans))
 
         # silhouette score in terms of separability of original labels, not the clustering labels
-        # using both original features and PCA features
         clustering_scores["SI"].append(silhouette_score(features, target))
-        clustering_scores["SI_pca"].append(silhouette_score(features_pca, target))
 
         clustering_scores["task"].append(task)
         clustering_scores["run"].append(run)
@@ -862,7 +926,6 @@ def run_clustering(
             "dFC method": list(),
             "Kmeans ARI": list(),
             "SI": list(),
-            "SI_pca": list(),
         }
 
         clustering_RESULTS = {}
@@ -905,7 +968,6 @@ def task_paradigm_clustering(
     dFC_root,
     output_root,
     normalize_dFC=True,
-    explained_var_threshold=0.95,
 ):
     for session in SESSIONS:
         # find SUBJECTS common to all tasks
@@ -923,20 +985,23 @@ def task_paradigm_clustering(
 
         X = None
         y = None
+        subj_label = None
         measure_name = None
         for task_id, task in enumerate(TASKS):
             for run in RUNS[task]:
-                X_new, _, _, _, _, _, measure_name_new = dFC_feature_extraction(
-                    task=task,
-                    train_subjects=SUBJECTS,
-                    test_subjects=[],
-                    dFC_id=dFC_id,
-                    roi_root=roi_root,
-                    dFC_root=dFC_root,
-                    run=run,
-                    session=session,
-                    dynamic_pred="no",
-                    normalize_dFC=normalize_dFC,
+                X_new, _, _, _, subj_label_new, _, measure_name_new = (
+                    dFC_feature_extraction(
+                        task=task,
+                        train_subjects=SUBJECTS,
+                        test_subjects=[],
+                        dFC_id=dFC_id,
+                        roi_root=roi_root,
+                        dFC_root=dFC_root,
+                        run=run,
+                        session=session,
+                        dynamic_pred="no",
+                        normalize_dFC=normalize_dFC,
+                    )
                 )
 
                 if measure_name is not None:
@@ -950,55 +1015,63 @@ def task_paradigm_clustering(
                 if X is None and y is None:
                     X = X_new
                     y = y_new
+                    subj_label = subj_label_new
                 else:
                     X = np.concatenate((X, X_new), axis=0)
                     y = np.concatenate((y, y_new), axis=0)
+                    subj_label = np.concatenate((subj_label, subj_label_new), axis=0)
 
         assert X.shape[0] == y.shape[0], "Number of samples do not match."
+        assert X.shape[0] == subj_label.shape[0], "Number of samples do not match."
+
+        # rearrange the order of the samples so that the samples of the same subject are together
+        idx = np.argsort(subj_label)
+        X = X[idx, :]
+        y = y[idx]
+        subj_label = subj_label[idx]
+
+        # embed dFC features
+        X, _ = embed_dFC_features(
+            train_subjects=SUBJECTS,
+            test_subjects=[],
+            X_train=X,
+            X_test=None,
+            y_train=y,
+            y_test=None,
+            subj_label_train=subj_label,
+            subj_label_test=None,
+            embedding="LE",
+            n_components=30,
+            n_neighbors_LE=90,
+        )
 
         # clustering
-        # apply kmeans clustering with PCA to dFC features
+        # apply kmeans clustering to dFC features
 
         n_clusters = len(TASKS)  # corresponding to task paradigms
 
         scaler = StandardScaler()
         X_normalized = scaler.fit_transform(X)
-        # PCA
-        # find number of components that explain 95% of variance
-        pca = PCA(svd_solver="full", whiten=False)
-        pca.fit(X_normalized)
-        n_components = (
-            np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[
-                0
-            ][0]
-            + 1
-        )
-        n_components = min(n_components, 100)
-        pca = PCA(n_components=n_components, svd_solver="full", whiten=False)
-        X_pca = pca.fit_transform(X_normalized)
         kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
-        labels_pred = kmeans.fit_predict(X_pca)
+        labels_pred = kmeans.fit_predict(X_normalized)
 
         # ARI score
         print(f"ARI score: {adjusted_rand_score(y, labels_pred)}")
 
-        # visualize clustering centroids
-        centroids = kmeans.cluster_centers_
-        centroids = pca.inverse_transform(centroids)
-        centroids = scaler.inverse_transform(centroids)
-        n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
-        centroids_mat = dFC_vec2mat(centroids, n_regions)
+        # # visualize clustering centroids
+        # centroids = kmeans.cluster_centers_
+        # centroids = pca.inverse_transform(centroids)
+        # centroids = scaler.inverse_transform(centroids)
+        # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
+        # centroids_mat = dFC_vec2mat(centroids, n_regions)
 
         task_paradigm_clstr_RESULTS = {
             "dFC_method": measure_name,
             "StandardScaler": scaler,
-            "num_PCs": n_components,
-            "PCA": pca,
             "kmeans": kmeans,
             "ARI": adjusted_rand_score(y, labels_pred),
-            "SI": silhouette_score(X, y),
-            "SI_pca": silhouette_score(X_pca, y),
-            "centroids": centroids_mat,
+            "SI": silhouette_score(X_normalized, y),
+            # "centroids": centroids_mat,
             "task_paradigms": TASKS,
         }
 

From bad1003f7bdaa0122faeaa958d12755221140794 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 30 Jul 2024 16:11:58 -0400
Subject: [PATCH 090/401] minor change

---
 task_dFC/ML.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 1db529a..a8d000e 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -1,6 +1,7 @@
 import argparse
 import json
 import os
+import traceback
 
 import numpy as np
 from scipy.spatial import procrustes
@@ -258,7 +259,7 @@ def embed_dFC_features(
     subj_label_test,
     embedding="PCA",
     n_components=30,
-    n_neighbors_LE=90,
+    n_neighbors_LE=100,
 ):
     """
     Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous.
@@ -657,7 +658,7 @@ def task_presence_classification(
         subj_label_test=subj_label_test,
         embedding="LE",
         n_components=30,
-        n_neighbors_LE=90,
+        n_neighbors_LE=100,
     )
 
     # task presence classification
@@ -790,7 +791,7 @@ def task_presence_clustering(
         subj_label_test=None,
         embedding="LE",
         n_components=30,
-        n_neighbors_LE=90,
+        n_neighbors_LE=100,
     )
 
     # clustering
@@ -1042,7 +1043,7 @@ def task_paradigm_clustering(
             subj_label_test=None,
             embedding="LE",
             n_components=30,
-            n_neighbors_LE=90,
+            n_neighbors_LE=100,
         )
 
         # clustering
@@ -1174,6 +1175,7 @@ def task_paradigm_clustering(
         )
     except Exception as e:
         print(f"Error in classification for dFC ID {dFC_id}: {e}")
+        traceback.print_exc()
     print(f"Task presence classification finished for dFC ID {dFC_id}.")
     print(f"Task presence clustering started for dFC ID {dFC_id} ...")
     try:
@@ -1189,6 +1191,7 @@ def task_paradigm_clustering(
         )
     except Exception as e:
         print(f"Error in clustering for dFC ID {dFC_id}: {e}")
+        traceback.print_exc()
 
     print(f"Task presence clustering finished for dFC ID {dFC_id}.")
 
@@ -1206,6 +1209,7 @@ def task_paradigm_clustering(
         )
     except Exception as e:
         print(f"Error in task paradigm clustering for dFC ID {dFC_id}: {e}")
+        traceback.print_exc()
 
     print(f"Task paradigm clustering finished for dFC ID {dFC_id}.")
     print(f"Task presence prediction finished for dFC ID {dFC_id}.")

From 868cbea000418acaaa329c621c6934c2fbd820a4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 30 Jul 2024 16:51:01 -0400
Subject: [PATCH 091/401] change in LE

---
 task_dFC/ML.py | 39 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index a8d000e..9a6c0f6 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -259,7 +259,7 @@ def embed_dFC_features(
     subj_label_test,
     embedding="PCA",
     n_components=30,
-    n_neighbors_LE=100,
+    n_neighbors_LE=110,
 ):
     """
     Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous.
@@ -307,8 +307,37 @@ def embed_dFC_features(
             if subject == best_subject:
                 X_subj_embed_transformed = X_subj_embed
             else:
+                # for the procrustes transformation, the number of samples should be the same
+                if (
+                    X_subj_embed.shape[0]
+                    > embed_dict[best_subject]["X_subj_embed"].shape[0]
+                ):
+                    # add zero rows to the embedding of the best subject
+                    X_best_subj_embed = np.concatenate(
+                        (
+                            embed_dict[best_subject]["X_subj_embed"],
+                            np.zeros(
+                                (
+                                    X_subj_embed.shape[0]
+                                    - embed_dict[best_subject]["X_subj_embed"].shape[0],
+                                    n_components,
+                                )
+                            ),
+                        ),
+                        axis=0,
+                    )
+                elif (
+                    X_subj_embed.shape[0]
+                    < embed_dict[best_subject]["X_subj_embed"].shape[0]
+                ):
+                    # remove extra rows from the embedding of the best subject
+                    X_best_subj_embed = embed_dict[best_subject]["X_subj_embed"][
+                        : X_subj_embed.shape[0], :
+                    ]
+                else:
+                    X_best_subj_embed = embed_dict[best_subject]["X_subj_embed"]
                 _, X_subj_embed_transformed, _ = procrustes(
-                    embed_dict[best_subject]["X_subj_embed"], X_subj_embed
+                    X_best_subj_embed, X_subj_embed
                 )
             if X_train_embed is None:
                 X_train_embed = X_subj_embed_transformed
@@ -658,7 +687,7 @@ def task_presence_classification(
         subj_label_test=subj_label_test,
         embedding="LE",
         n_components=30,
-        n_neighbors_LE=100,
+        n_neighbors_LE=110,
     )
 
     # task presence classification
@@ -791,7 +820,7 @@ def task_presence_clustering(
         subj_label_test=None,
         embedding="LE",
         n_components=30,
-        n_neighbors_LE=100,
+        n_neighbors_LE=110,
     )
 
     # clustering
@@ -1043,7 +1072,7 @@ def task_paradigm_clustering(
             subj_label_test=None,
             embedding="LE",
             n_components=30,
-            n_neighbors_LE=100,
+            n_neighbors_LE=110,
         )
 
         # clustering

From 6f2f831f18d6d032aa3f64a6483f4c5f8459571b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 30 Jul 2024 19:57:27 -0400
Subject: [PATCH 092/401] minor change

---
 task_dFC/ML.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 9a6c0f6..eace7ea 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -259,7 +259,7 @@ def embed_dFC_features(
     subj_label_test,
     embedding="PCA",
     n_components=30,
-    n_neighbors_LE=110,
+    n_neighbors_LE=150,
 ):
     """
     Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous.
@@ -687,7 +687,7 @@ def task_presence_classification(
         subj_label_test=subj_label_test,
         embedding="LE",
         n_components=30,
-        n_neighbors_LE=110,
+        n_neighbors_LE=150,
     )
 
     # task presence classification
@@ -820,7 +820,7 @@ def task_presence_clustering(
         subj_label_test=None,
         embedding="LE",
         n_components=30,
-        n_neighbors_LE=110,
+        n_neighbors_LE=150,
     )
 
     # clustering
@@ -1072,7 +1072,7 @@ def task_paradigm_clustering(
             subj_label_test=None,
             embedding="LE",
             n_components=30,
-            n_neighbors_LE=110,
+            n_neighbors_LE=150,
         )
 
         # clustering

From a6a630e60c0cb5dfcd00d2bf3eb82c9120b2e55a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 31 Jul 2024 15:13:08 -0400
Subject: [PATCH 093/401] change l2 to l1 in logreg

---
 task_dFC/ML.py | 78 ++++++++++++++++++++++++++++----------------------
 1 file changed, 43 insertions(+), 35 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index eace7ea..dad5296 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -248,6 +248,36 @@ def load_task_data(roi_root, subj, task, run=None, session=None):
     return task_data
 
 
+def precheck_for_procruste(X_best, X_subj):
+    """
+    Check if the two matrices have the same number of rows. if not, make them the same.
+    """
+    # for the procrustes transformation, the number of samples should be the same
+    if X_subj.shape[0] > X_best.shape[0]:
+        # add zero rows to the embedding of the best subject
+        X_best_new = np.concatenate(
+            (
+                X_best,
+                np.zeros(
+                    (
+                        X_subj.shape[0] - X_best.shape[0],
+                        X_best.shape[1],
+                    )
+                ),
+            ),
+            axis=0,
+        )
+    elif X_subj.shape[0] < X_best.shape[0]:
+        # remove extra rows from the embedding of the best subject
+        X_best_new = X_best[: X_subj.shape[0], :]
+    else:
+        X_best_new = X_best
+
+    X_best_new = X_best_new.copy()
+
+    return X_best_new
+
+
 def embed_dFC_features(
     train_subjects,
     test_subjects,
@@ -259,7 +289,7 @@ def embed_dFC_features(
     subj_label_test,
     embedding="PCA",
     n_components=30,
-    n_neighbors_LE=150,
+    n_neighbors_LE=125,
 ):
     """
     Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous.
@@ -308,34 +338,9 @@ def embed_dFC_features(
                 X_subj_embed_transformed = X_subj_embed
             else:
                 # for the procrustes transformation, the number of samples should be the same
-                if (
-                    X_subj_embed.shape[0]
-                    > embed_dict[best_subject]["X_subj_embed"].shape[0]
-                ):
-                    # add zero rows to the embedding of the best subject
-                    X_best_subj_embed = np.concatenate(
-                        (
-                            embed_dict[best_subject]["X_subj_embed"],
-                            np.zeros(
-                                (
-                                    X_subj_embed.shape[0]
-                                    - embed_dict[best_subject]["X_subj_embed"].shape[0],
-                                    n_components,
-                                )
-                            ),
-                        ),
-                        axis=0,
-                    )
-                elif (
-                    X_subj_embed.shape[0]
-                    < embed_dict[best_subject]["X_subj_embed"].shape[0]
-                ):
-                    # remove extra rows from the embedding of the best subject
-                    X_best_subj_embed = embed_dict[best_subject]["X_subj_embed"][
-                        : X_subj_embed.shape[0], :
-                    ]
-                else:
-                    X_best_subj_embed = embed_dict[best_subject]["X_subj_embed"]
+                X_best_subj_embed = precheck_for_procruste(
+                    embed_dict[best_subject]["X_subj_embed"], X_subj_embed
+                )
                 _, X_subj_embed_transformed, _ = procrustes(
                     X_best_subj_embed, X_subj_embed
                 )
@@ -359,9 +364,12 @@ def embed_dFC_features(
                 n_neighbors=n_neighbors_LE,
             )
             X_subj_embed = LE.fit_transform(X_subj)
-            _, X_subj_embed_transformed, _ = procrustes(
+            # procrustes transformation
+            # for the procrustes transformation, the number of samples should be the same
+            X_best_subj_embed = precheck_for_procruste(
                 embed_dict[best_subject]["X_subj_embed"], X_subj_embed
             )
+            _, X_subj_embed_transformed, _ = procrustes(X_best_subj_embed, X_subj_embed)
             if X_test_embed is None:
                 X_test_embed = X_subj_embed_transformed
             else:
@@ -487,7 +495,7 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test):
     Logistic regression classification
     """
     # create a pipeline with a logistic regression model to find the best C
-    logistic_reg = make_pipeline(StandardScaler(), LogisticRegression())
+    logistic_reg = make_pipeline(StandardScaler(), LogisticRegression(penalty="l1"))
     # create a dictionary of all values we want to test for C
     param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
     # use gridsearch to test all values for C
@@ -499,7 +507,7 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test):
 
     log_reg = make_pipeline(
         StandardScaler(),
-        LogisticRegression(C=C),
+        LogisticRegression(penalty="l1", C=C),
     ).fit(X_train, y_train)
 
     RESULT = {
@@ -687,7 +695,7 @@ def task_presence_classification(
         subj_label_test=subj_label_test,
         embedding="LE",
         n_components=30,
-        n_neighbors_LE=150,
+        n_neighbors_LE=125,
     )
 
     # task presence classification
@@ -820,7 +828,7 @@ def task_presence_clustering(
         subj_label_test=None,
         embedding="LE",
         n_components=30,
-        n_neighbors_LE=150,
+        n_neighbors_LE=125,
     )
 
     # clustering
@@ -1072,7 +1080,7 @@ def task_paradigm_clustering(
             subj_label_test=None,
             embedding="LE",
             n_components=30,
-            n_neighbors_LE=150,
+            n_neighbors_LE=125,
         )
 
         # clustering

From 039a9e848f3cef84700d84190f3a3b5abe544d6d Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 31 Jul 2024 15:33:59 -0400
Subject: [PATCH 094/401] minor change

---
 task_dFC/ML.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index dad5296..41fb0ff 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -495,7 +495,9 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test):
     Logistic regression classification
     """
     # create a pipeline with a logistic regression model to find the best C
-    logistic_reg = make_pipeline(StandardScaler(), LogisticRegression(penalty="l1"))
+    logistic_reg = make_pipeline(
+        StandardScaler(), LogisticRegression(penalty="l1", solver="saga")
+    )
     # create a dictionary of all values we want to test for C
     param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
     # use gridsearch to test all values for C
@@ -507,7 +509,7 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test):
 
     log_reg = make_pipeline(
         StandardScaler(),
-        LogisticRegression(penalty="l1", C=C),
+        LogisticRegression(penalty="l1", C=C, solver="saga"),
     ).fit(X_train, y_train)
 
     RESULT = {

From 13a36b995d665efb709cc3281b43f347ef3323e6 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 31 Jul 2024 15:41:36 -0400
Subject: [PATCH 095/401] minor fix

---
 task_dFC/ML.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 41fb0ff..eb35ccd 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -314,7 +314,7 @@ def embed_dFC_features(
             y_subj = y_train[subj_label_train == subject]
             LE = SpectralEmbedding(
                 n_components=n_components,
-                n_neighbors=n_neighbors_LE,
+                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
             )
             X_subj_embed = LE.fit_transform(X_subj)
             SI = silhouette_score(X_subj_embed, y_subj)
@@ -361,7 +361,7 @@ def embed_dFC_features(
             X_subj = X_test[subj_label_test == subject, :]
             LE = SpectralEmbedding(
                 n_components=n_components,
-                n_neighbors=n_neighbors_LE,
+                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
             )
             X_subj_embed = LE.fit_transform(X_subj)
             # procrustes transformation

From e369e2170c6a19b92c5d8ee8644ce4e4d1bd4481 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 31 Jul 2024 18:01:20 -0400
Subject: [PATCH 096/401] change nifti roi so it can handle common events files

---
 task_dFC/nifti_to_roi_signal.py | 77 ++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 36 deletions(-)

diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 0d65049..46c0c66 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -84,52 +84,57 @@ def run_roi_signal_extraction(
         num_time_mri = time_series.n_time
         ################################# EXTRACT TASK LABELS #########################
         oversampling = 50  # more samples per TR than the func data to have a better event_labels time resolution
-        if task == "task-restingstate":
-            events = []
-            event_types = ["rest"]
-            event_labels = np.zeros((int(num_time_mri * oversampling), 1))
-            task_labels = np.zeros((int(num_time_mri * oversampling), 1))
-            Fs_task = float(1 / TR_mri) * oversampling
-        else:
-            ALL_EVENTS_FILES = os.listdir(task_events_root)
+
+        ALL_EVENTS_FILES = os.listdir(task_events_root)
+        ALL_EVENTS_FILES = [
+            file_i
+            for file_i in ALL_EVENTS_FILES
+            if (f"{subj}_" in file_i)
+            and (f"_{task}_" in file_i)
+            and ("events.tsv" in file_i)
+        ]
+        if not run is None:
+            ALL_EVENTS_FILES = [
+                file_i for file_i in ALL_EVENTS_FILES if f"_{run}_" in file_i
+            ]
+        if not session is None:
             ALL_EVENTS_FILES = [
+                file_i for file_i in ALL_EVENTS_FILES if f"_{session}_" in file_i
+            ]
+
+        if not len(ALL_EVENTS_FILES) == 1:
+            # in some cases the event file is common for all subjects and can be found in f"{main_root}/bids"
+            ALL_EVENTS_FILES_COMMON = os.listdir(f"{main_root}/bids/")
+            ALL_EVENTS_FILES_COMMON = [
                 file_i
-                for file_i in ALL_EVENTS_FILES
-                if (f"{subj}_" in file_i)
-                and (f"_{task}_" in file_i)
-                and ("events.tsv" in file_i)
+                for file_i in ALL_EVENTS_FILES_COMMON
+                if (f"{task}_" in file_i) and ("events.tsv" in file_i)
             ]
-            if not run is None:
-                ALL_EVENTS_FILES = [
-                    file_i for file_i in ALL_EVENTS_FILES if f"_{run}_" in file_i
-                ]
-            if not session is None:
-                ALL_EVENTS_FILES = [
-                    file_i for file_i in ALL_EVENTS_FILES if f"_{session}_" in file_i
-                ]
-            if not len(ALL_EVENTS_FILES) == 1:
+            if len(ALL_EVENTS_FILES_COMMON) == 1:
+                events_file = f"{main_root}/bids/{ALL_EVENTS_FILES_COMMON[0]}"
+            else:
                 # if the events file is not found, exclude the subject
                 if run is None:
                     print(f"Events file not found for {subj} {session_str} {task}")
                 else:
                     print(f"Events file not found for {subj} {session_str} {task} {run}")
                 return
-            # load the tsv events file
+        else:
             events_file = f"{task_events_root}/{ALL_EVENTS_FILES[0]}"
-            events = np.genfromtxt(events_file, delimiter="\t", dtype=str)
-            # get the event labels
-            event_labels, Fs_task, event_types = task_utils.events_time_to_labels(
-                events=events,
-                TR_mri=TR_mri,
-                num_time_mri=num_time_mri,
-                event_types=None,
-                oversampling=oversampling,
-                return_0_1=False,
-            )
-            # fill task labels with task's index
-            task_labels = np.ones((int(num_time_mri * oversampling), 1)) * TASKS.index(
-                task
-            )
+
+        # load the tsv events file
+        events = np.genfromtxt(events_file, delimiter="\t", dtype=str)
+        # get the event labels
+        event_labels, Fs_task, event_types = task_utils.events_time_to_labels(
+            events=events,
+            TR_mri=TR_mri,
+            num_time_mri=num_time_mri,
+            event_types=None,
+            oversampling=oversampling,
+            return_0_1=False,
+        )
+        # fill task labels with task's index
+        task_labels = np.ones((int(num_time_mri * oversampling), 1)) * TASKS.index(task)
         ################################# SAVE #################################
         # save the ROI time series and task data
         task_data = {

From 007c81fb7bf43befcee3a343b03a5399ea01896b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 31 Jul 2024 18:25:47 -0400
Subject: [PATCH 097/401] handle events files with diff trial type and rest
 labels

---
 pydfc/task_utils.py                    | 54 ++++++++++++++++----------
 task_dFC/nifti_to_roi_signal.py        |  9 +++++
 task_dFC/run_scripts/dataset_info.json |  2 +
 3 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index a807da1..4dedc52 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -19,19 +19,29 @@
 
 
 def events_time_to_labels(
-    events, TR_mri, num_time_mri, event_types=None, oversampling=50, return_0_1=False
+    events,
+    TR_mri,
+    num_time_mri,
+    event_types=None,
+    oversampling=50,
+    trial_type_label="trial_type",
+    rest_labels=["rest", "Rest"],
+    return_0_1=False,
 ):
     """
     event_types is a list of event types to be considered. If None, it will found based on events.
     Assigns the longest event in each TR to that TR (in the interval from last TR to current TR).
     It assumes that the first time point is TR0 which corresponds to [0 sec, TR sec] interval.
     oversampling: number of samples per TR_mri to improve the time resolution of tasks
+
+    if trial_type_label is None, we use event type "unknown" as the trial type
     """
 
     # find which column is the "onset" in the first row
     onset_idx = np.where(events[0, :] == "onset")[0][0]
     duration_idx = np.where(events[0, :] == "duration")[0][0]
-    trial_type_idx = np.where(events[0, :] == "trial_type")[0][0]
+    if trial_type_label is not None:
+        trial_type_idx = np.where(events[0, :] == trial_type_label)[0][0]
 
     assert (
         events[0, onset_idx] == "onset"
@@ -39,19 +49,21 @@ def events_time_to_labels(
     assert (
         events[0, duration_idx] == "duration"
     ), "Something went wrong with the events file! The duration column was not found!"
-    assert (
-        events[0, trial_type_idx] == "trial_type"
-    ), "Something went wrong with the events file! The trial_type column was not found!"
+    if trial_type_label is not None:
+        assert (
+            events[0, trial_type_idx] == trial_type_label
+        ), "Something went wrong with the events file! The trial_type column was not found!"
 
     if event_types is None:
-        event_types = list(np.unique(events[1:, trial_type_idx]))
-        # if rest is already there, remove it
-        if "rest" in event_types:
-            warnings.warn("rest is already in the event types")
-            event_types.remove("rest")
-        if "Rest" in event_types:
-            warnings.warn("Rest is already in the event types")
-            event_types.remove("Rest")
+        if trial_type_label is None:
+            event_types = ["unknown"]
+        else:
+            event_types = list(np.unique(events[1:, trial_type_idx]))
+            # remove all the rest labels
+            for rest_label in rest_labels:
+                if rest_label in event_types:
+                    event_types.remove(rest_label)
+        # add the rest label to the beginning for consistency
         event_types = ["rest"] + event_types
 
     Fs = float(1 / TR_mri) * oversampling
@@ -62,18 +74,20 @@ def events_time_to_labels(
         if i == 0:
             continue
 
-        if events[i, trial_type_idx] in event_types:
-            if ("rest" in events[i, trial_type_idx]) or (
-                "Rest" in events[i, trial_type_idx]
-            ):
+        if trial_type_label is None:
+            trial_type = "unknown"
+        else:
+            trial_type = events[i, trial_type_idx]
+
+        if trial_type in event_types:
+            # the only rest label that is left in event types is "rest" but we don't want to consider it
+            if trial_type == "rest":
                 continue
             start_time = float(events[i, onset_idx])
             end_time = float(events[i, onset_idx]) + float(events[i, duration_idx])
             start_timepoint = int(np.rint(start_time * Fs))
             end_timepoint = int(np.rint(end_time * Fs))
-            event_labels[start_timepoint:end_timepoint] = event_types.index(
-                events[i, trial_type_idx]
-            )
+            event_labels[start_timepoint:end_timepoint] = event_types.index(trial_type)
 
     if return_0_1:
         event_labels = np.multiply(event_labels != 0, 1)
diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 46c0c66..3953865 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -21,6 +21,8 @@ def run_roi_signal_extraction(
     output_root,
     session=None,
     RUNS=[None],
+    trial_type_label="trial_type",
+    rest_labels=[],
 ):
     """
     Extract ROI signals and task labels for a given subject and task
@@ -131,6 +133,8 @@ def run_roi_signal_extraction(
             num_time_mri=num_time_mri,
             event_types=None,
             oversampling=oversampling,
+            trial_type_label=trial_type_label,
+            rest_labels=rest_labels,
             return_0_1=False,
         )
         # fill task labels with task's index
@@ -226,6 +230,9 @@ def run_roi_signal_extraction(
     else:
         output_root = dataset_info["roi_root"]
 
+    trial_type_label = dataset_info["trial_type_label"]
+    rest_labels = dataset_info["rest_labels"]
+
     for session in SESSIONS:
         for task in TASKS:
             run_roi_signal_extraction(
@@ -237,6 +244,8 @@ def run_roi_signal_extraction(
                 output_root=output_root,
                 session=session,
                 RUNS=RUNS[task],
+                trial_type_label=trial_type_label,
+                rest_labels=rest_labels,
             )
 
     print(
diff --git a/task_dFC/run_scripts/dataset_info.json b/task_dFC/run_scripts/dataset_info.json
index 8296d5b..16d775e 100644
--- a/task_dFC/run_scripts/dataset_info.json
+++ b/task_dFC/run_scripts/dataset_info.json
@@ -7,6 +7,8 @@
 	"dFC_root" : "{main_root}/derivatives/dFC_assessed",
 	"ML_root" : "{main_root}/derivatives/ML",
 	"reports_root" : "{main_root}/derivatives/reports",
+	"trial_type_label" : "trial_type",
+	"rest_labels" : ["rest", "Rest"],
 	"bold_suffix" : "_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz",
 	"SESSIONS" : [
 		"ses-1"

From da75c996435b900df3bdce697afa7bc2948d1d47 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 31 Jul 2024 23:35:35 -0400
Subject: [PATCH 098/401] add SI to report

---
 task_dFC/generate_report.py | 551 ++++++++++++++++++++++--------------
 1 file changed, 336 insertions(+), 215 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 2002bf6..4d1bdae 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -591,6 +591,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
     if run is not None:
         dataframe = dataframe[dataframe["run"] == run]
 
+    # plot ARI score
     plt.figure(figsize=(10, 5))
     g = sns.pointplot(
         data=dataframe[dataframe["task"] == task],
@@ -616,7 +617,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
 
     if run is None:
         plt.savefig(
-            f"{output_dir}/clustering_results_{task}.{save_fig_format}",
+            f"{output_dir}/clustering_results_ARI_{task}.{save_fig_format}",
             dpi=fig_dpi,
             bbox_inches=fig_bbox_inches,
             pad_inches=fig_pad,
@@ -624,7 +625,49 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
         )
     else:
         plt.savefig(
-            f"{output_dir}/clustering_results_{task}_{run}.{save_fig_format}",
+            f"{output_dir}/clustering_results_ARI_{task}_{run}.{save_fig_format}",
+            dpi=fig_dpi,
+            bbox_inches=fig_bbox_inches,
+            pad_inches=fig_pad,
+            format=save_fig_format,
+        )
+
+    plt.close()
+
+    # plot SI score
+    plt.figure(figsize=(10, 5))
+    g = sns.pointplot(
+        data=dataframe[dataframe["task"] == task],
+        x="dFC method",
+        y="SI",
+        errorbar="sd",
+        linestyle="none",
+        dodge=True,
+        capsize=0.1,
+    )
+
+    if show_title:
+        g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"})
+    # save the figure
+    if session is None:
+        output_dir = f"{output_root}/group_results/clustering"
+    else:
+        output_dir = f"{output_root}/group_results/clustering/{session}"
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    if run is None:
+        plt.savefig(
+            f"{output_dir}/clustering_results_SI_{task}.{save_fig_format}",
+            dpi=fig_dpi,
+            bbox_inches=fig_bbox_inches,
+            pad_inches=fig_pad,
+            format=save_fig_format,
+        )
+    else:
+        plt.savefig(
+            f"{output_dir}/clustering_results_SI_{task}_{run}.{save_fig_format}",
             dpi=fig_dpi,
             bbox_inches=fig_bbox_inches,
             pad_inches=fig_pad,
@@ -665,6 +708,7 @@ def plot_paradigm_clustering_score(
     paradigm_clustering_RESULTS = {
         "dFC method": [],
         "ARI score": [],
+        "SI score": [],
     }
     for result_file in ALL_PARADIGM_CLUSTERING_RESULTS:
         paradigm_clustering_RESULTS_new = np.load(
@@ -676,6 +720,9 @@ def plot_paradigm_clustering_score(
         paradigm_clustering_RESULTS["ARI score"].append(
             paradigm_clustering_RESULTS_new["ARI"]
         )
+        paradigm_clustering_RESULTS["SI score"].append(
+            paradigm_clustering_RESULTS_new["SI"]
+        )
 
     sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0})
 
@@ -683,6 +730,7 @@ def plot_paradigm_clustering_score(
 
     dataframe = pd.DataFrame(paradigm_clustering_RESULTS)
 
+    # plot ARI score
     plt.figure(figsize=(10, 5))
     g = sns.pointplot(
         data=dataframe,
@@ -709,7 +757,7 @@ def plot_paradigm_clustering_score(
         os.makedirs(output_dir)
 
     plt.savefig(
-        f"{output_dir}/paradigm_clustering_results.{save_fig_format}",
+        f"{output_dir}/paradigm_clustering_results_ARI.{save_fig_format}",
         dpi=fig_dpi,
         bbox_inches=fig_bbox_inches,
         pad_inches=fig_pad,
@@ -718,155 +766,191 @@ def plot_paradigm_clustering_score(
 
     plt.close()
 
+    # plot SI score
+    plt.figure(figsize=(10, 5))
+    g = sns.pointplot(
+        data=dataframe,
+        x="dFC method",
+        y="SI score",
+        linestyle="none",
+        dodge=True,
+        capsize=0.1,
+    )
 
-def plot_paradigm_clstr_centroids(
-    ML_root,
-    output_root,
-    session=None,
-):
-    """ """
-    # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy
-    # find all the paradigm_clustering_RESULTS files in the directory
-    if session is None:
-        input_dir = f"{ML_root}"
-    else:
-        input_dir = f"{ML_root}/{session}"
+    if show_title:
+        g.set_title(
+            "Task Paradigm Clustering Performance",
+            fontdict={"fontsize": 10, "fontweight": "bold"},
+        )
 
+    # save the figure
     if session is None:
-        output_dir = f"{output_root}/group_results/paradigm_clustering_centroids"
+        output_dir = f"{output_root}/group_results/paradigm_clustering"
     else:
-        output_dir = (
-            f"{output_root}/group_results/paradigm_clustering_centroids/{session}"
-        )
+        output_dir = f"{output_root}/group_results/paradigm_clustering/{session}"
 
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
 
-    ALL_PARADIGM_CLUSTERING_RESULTS = os.listdir(input_dir)
-    ALL_PARADIGM_CLUSTERING_RESULTS = [
-        result_file
-        for result_file in ALL_PARADIGM_CLUSTERING_RESULTS
-        if "task_paradigm_clstr_RESULTS_" in result_file
-    ]
-    ALL_PARADIGM_CLUSTERING_RESULTS.sort()
-
-    for result_file in ALL_PARADIGM_CLUSTERING_RESULTS:
-        paradigm_clustering_RESULTS_new = np.load(
-            f"{input_dir}/{result_file}", allow_pickle="TRUE"
-        ).item()
-
-        measure_name = paradigm_clustering_RESULTS_new["dFC_method"]
-        centroids_mats = paradigm_clustering_RESULTS_new["centroids"]
-
-        centroids_dict = {}
-        for i, centroid_mat in enumerate(centroids_mats):
-            centroids_dict[f"Cluster {i + 1}"] = centroid_mat
-
-        visualize_conn_mat_dict(
-            data=centroids_dict,
-            title=f"Task Paradigm Centroids {measure_name}",
-            cmap="seismic",
-            normalize=True,
-            disp_diag=False,
-            save_image=True,
-            output_root=f"{output_dir}/",
-            center_0=True,
-            # node_networks=None,
-        )
-
-
-def plot_dFC_clustering(
-    dFC_root,
-    subj,
-    task,
-    start_time,
-    end_time,
-    output_root,
-    run=None,
-    session=None,
-    normalize_dFC=True,
-):
-    task_data = load_task_data(roi_root, subj, task, run, session)
-    TR_mri = task_data["TR_mri"]
-
-    for dFC_id in range(
-        0, 20
-    ):  # change this to the number of dFCs you have or right a function that finds available dFC ids
-        try:
-            dFC = load_dFC(dFC_root, subj, task, dFC_id, run, session)
-        except Exception:
-            pass
-
-        dFC_mat = dFC.get_dFC_mat()
-        TR_array = dFC.TR_array
-        if normalize_dFC:
-            dFC_mat = rank_norm(dFC_mat)
-        dFC_vecs = dFC_mat2vec(dFC_mat)
-
-        if session is None:
-            clustering_RESULTS = np.load(
-                f"{ML_root}/clustering_RESULTS_{dFC_id}.npy", allow_pickle="TRUE"
-            ).item()
-        else:
-            clustering_RESULTS = np.load(
-                f"{ML_root}/{session}/clustering_RESULTS_{dFC_id}.npy",
-                allow_pickle="TRUE",
-            ).item()
+    plt.savefig(
+        f"{output_dir}/paradigm_clustering_results_SI.{save_fig_format}",
+        dpi=fig_dpi,
+        bbox_inches=fig_bbox_inches,
+        pad_inches=fig_pad,
+        format=save_fig_format,
+    )
 
-        if run is None:
-            scaler = clustering_RESULTS[task]["StandardScaler"]
-            pca = clustering_RESULTS[task]["PCA"]
-            kmeans = clustering_RESULTS[task]["kmeans"]
-        else:
-            scaler = clustering_RESULTS[task][run]["StandardScaler"]
-            pca = clustering_RESULTS[task][run]["PCA"]
-            kmeans = clustering_RESULTS[task][run]["kmeans"]
-
-        dFC_vecs_normalized = scaler.transform(dFC_vecs)
-        dFC_vecs_pca = pca.transform(dFC_vecs_normalized)
-        cluster_labels = kmeans.predict(dFC_vecs_pca)
-
-        start_TR = int(start_time / TR_mri)
-        end_TR = int(end_time / TR_mri)
-
-        start_TR_idx = np.where(np.array(TR_array) >= start_TR)[0][0]
-        end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1]
-
-        fig_width = int(2.5 * (end_time - start_time) / 2)
-        fig_width = min(fig_width, 500)
-        plt.figure(figsize=(fig_width, 5))
-        time = TR_array[start_TR_idx:end_TR_idx] * TR_mri
-        plt.plot(
-            time[start_TR:end_TR], cluster_labels[start_TR_idx:end_TR_idx], linewidth=4
-        )
-        # put vertical lines at the start of each TR
-        for t in time:
-            plt.axvline(x=t, color="r", linestyle="--")
-            # plt.text(t, 0.5, f"TR {int(t/TR_mri)}", fontsize=8, color='black', ha='center')
-        plt.title(f"Cluster labels of {dFC.measure.measure_name}")
-        plt.xlabel("Time (s)")
+    plt.close()
 
-        # save the figure
-        output_dir = f"{output_root}/subject_results/{subj}/dFC_clustering"
-        if session is not None:
-            output_dir = f"{output_dir}/{session}"
-        output_dir = f"{output_dir}/{task}"
-        if run is not None:
-            output_dir = f"{output_dir}/{run}"
-        output_dir = f"{output_dir}/"
 
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
+# def plot_paradigm_clstr_centroids(
+#     ML_root,
+#     output_root,
+#     session=None,
+# ):
+#     """ """
+#     # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy
+#     # find all the paradigm_clustering_RESULTS files in the directory
+#     if session is None:
+#         input_dir = f"{ML_root}"
+#     else:
+#         input_dir = f"{ML_root}/{session}"
 
-        plt.savefig(
-            f"{output_dir}/dFC_clustering_{dFC.measure.measure_name}.{save_fig_format}",
-            dpi=fig_dpi,
-            bbox_inches=fig_bbox_inches,
-            pad_inches=fig_pad,
-            format=save_fig_format,
-        )
+#     if session is None:
+#         output_dir = f"{output_root}/group_results/paradigm_clustering_centroids"
+#     else:
+#         output_dir = (
+#             f"{output_root}/group_results/paradigm_clustering_centroids/{session}"
+#         )
+
+#     if not os.path.exists(output_dir):
+#         os.makedirs(output_dir)
+
+#     ALL_PARADIGM_CLUSTERING_RESULTS = os.listdir(input_dir)
+#     ALL_PARADIGM_CLUSTERING_RESULTS = [
+#         result_file
+#         for result_file in ALL_PARADIGM_CLUSTERING_RESULTS
+#         if "task_paradigm_clstr_RESULTS_" in result_file
+#     ]
+#     ALL_PARADIGM_CLUSTERING_RESULTS.sort()
+
+#     for result_file in ALL_PARADIGM_CLUSTERING_RESULTS:
+#         paradigm_clustering_RESULTS_new = np.load(
+#             f"{input_dir}/{result_file}", allow_pickle="TRUE"
+#         ).item()
+
+#         measure_name = paradigm_clustering_RESULTS_new["dFC_method"]
+#         centroids_mats = paradigm_clustering_RESULTS_new["centroids"]
+
+#         centroids_dict = {}
+#         for i, centroid_mat in enumerate(centroids_mats):
+#             centroids_dict[f"Cluster {i + 1}"] = centroid_mat
+
+#         visualize_conn_mat_dict(
+#             data=centroids_dict,
+#             title=f"Task Paradigm Centroids {measure_name}",
+#             cmap="seismic",
+#             normalize=True,
+#             disp_diag=False,
+#             save_image=True,
+#             output_root=f"{output_dir}/",
+#             center_0=True,
+#             # node_networks=None,
+#         )
+
+
+# def plot_dFC_clustering(
+#     dFC_root,
+#     subj,
+#     task,
+#     start_time,
+#     end_time,
+#     output_root,
+#     run=None,
+#     session=None,
+#     normalize_dFC=True,
+# ):
+#     task_data = load_task_data(roi_root, subj, task, run, session)
+#     TR_mri = task_data["TR_mri"]
+
+#     for dFC_id in range(
+#         0, 20
+#     ):  # change this to the number of dFCs you have or write a function that finds available dFC ids
+#         try:
+#             dFC = load_dFC(dFC_root, subj, task, dFC_id, run, session)
+#         except Exception:
+#             pass
+
+#         dFC_mat = dFC.get_dFC_mat()
+#         TR_array = dFC.TR_array
+#         if normalize_dFC:
+#             dFC_mat = rank_norm(dFC_mat)
+#         dFC_vecs = dFC_mat2vec(dFC_mat)
+
+#         if session is None:
+#             clustering_RESULTS = np.load(
+#                 f"{ML_root}/clustering_RESULTS_{dFC_id}.npy", allow_pickle="TRUE"
+#             ).item()
+#         else:
+#             clustering_RESULTS = np.load(
+#                 f"{ML_root}/{session}/clustering_RESULTS_{dFC_id}.npy",
+#                 allow_pickle="TRUE",
+#             ).item()
 
-        plt.close()
+#         if run is None:
+#             scaler = clustering_RESULTS[task]["StandardScaler"]
+#             pca = clustering_RESULTS[task]["PCA"]
+#             kmeans = clustering_RESULTS[task]["kmeans"]
+#         else:
+#             scaler = clustering_RESULTS[task][run]["StandardScaler"]
+#             pca = clustering_RESULTS[task][run]["PCA"]
+#             kmeans = clustering_RESULTS[task][run]["kmeans"]
+
+#         dFC_vecs_normalized = scaler.transform(dFC_vecs)
+#         dFC_vecs_pca = pca.transform(dFC_vecs_normalized)
+#         cluster_labels = kmeans.predict(dFC_vecs_pca)
+
+#         start_TR = int(start_time / TR_mri)
+#         end_TR = int(end_time / TR_mri)
+
+#         start_TR_idx = np.where(np.array(TR_array) >= start_TR)[0][0]
+#         end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1]
+
+#         fig_width = int(2.5 * (end_time - start_time) / 2)
+#         fig_width = min(fig_width, 500)
+#         plt.figure(figsize=(fig_width, 5))
+#         time = TR_array[start_TR_idx:end_TR_idx] * TR_mri
+#         plt.plot(
+#             time[start_TR:end_TR], cluster_labels[start_TR_idx:end_TR_idx], linewidth=4
+#         )
+#         # put vertical lines at the start of each TR
+#         for t in time:
+#             plt.axvline(x=t, color="r", linestyle="--")
+#             # plt.text(t, 0.5, f"TR {int(t/TR_mri)}", fontsize=8, color='black', ha='center')
+#         plt.title(f"Cluster labels of {dFC.measure.measure_name}")
+#         plt.xlabel("Time (s)")
+
+#         # save the figure
+#         output_dir = f"{output_root}/subject_results/{subj}/dFC_clustering"
+#         if session is not None:
+#             output_dir = f"{output_dir}/{session}"
+#         output_dir = f"{output_dir}/{task}"
+#         if run is not None:
+#             output_dir = f"{output_dir}/{run}"
+#         output_dir = f"{output_dir}/"
+
+#         if not os.path.exists(output_dir):
+#             os.makedirs(output_dir)
+
+#         plt.savefig(
+#             f"{output_dir}/dFC_clustering_{dFC.measure.measure_name}.{save_fig_format}",
+#             dpi=fig_dpi,
+#             bbox_inches=fig_bbox_inches,
+#             pad_inches=fig_pad,
+#             format=save_fig_format,
+#         )
+
+#         plt.close()
 
 
 def plot_task_presence_features(
@@ -1042,28 +1126,28 @@ def create_html_report_subj_results(
                             )
                             file.write("<br>\n")
 
-                # display dFC clustering
-                img_height = 100
-                # for dFC matrices find all png files in the directory
-                dFC_clustering_dir = f"{subj_dir}/dFC_clustering/{session_task_run_dir}"
-                if os.path.exists(dFC_clustering_dir):
-                    for file_name in os.listdir(dFC_clustering_dir):
-                        if file_name.endswith(".png"):
-                            file.write(
-                                f"<h3>{file_name[file_name.find('dFC_clustering_')+15:file_name.find('.png')]}</h3>\n"
-                            )
-                            dFC_clustering_img = f"{dFC_clustering_dir}/{file_name}"
-                            # get the original size of the image
-                            img = plt.imread(dFC_clustering_img)
-                            height, width, _ = img.shape
-                            # change the width so that height equals img_height
-                            width = int(width * img_height / height)
-                            # replace the path to the image with a relative path
-                            dFC_clustering_img = dFC_clustering_img.replace(subj_dir, ".")
-                            file.write(
-                                f"<img src='{dFC_clustering_img}' alt='{file_name}' width='{width}' height='{img_height}'>\n"
-                            )
-                            file.write("<br>\n")
+                # # display dFC clustering
+                # img_height = 100
+                # # for dFC matrices find all png files in the directory
+                # dFC_clustering_dir = f"{subj_dir}/dFC_clustering/{session_task_run_dir}"
+                # if os.path.exists(dFC_clustering_dir):
+                #     for file_name in os.listdir(dFC_clustering_dir):
+                #         if file_name.endswith(".png"):
+                #             file.write(
+                #                 f"<h3>{file_name[file_name.find('dFC_clustering_')+15:file_name.find('.png')]}</h3>\n"
+                #             )
+                #             dFC_clustering_img = f"{dFC_clustering_dir}/{file_name}"
+                #             # get the original size of the image
+                #             img = plt.imread(dFC_clustering_img)
+                #             height, width, _ = img.shape
+                #             # change the width so that height equals img_height
+                #             width = int(width * img_height / height)
+                #             # replace the path to the image with a relative path
+                #             dFC_clustering_img = dFC_clustering_img.replace(subj_dir, ".")
+                #             file.write(
+                #                 f"<img src='{dFC_clustering_img}' alt='{file_name}' width='{width}' height='{img_height}'>\n"
+                #             )
+                #             file.write("<br>\n")
     file.write("</body>\n")
     file.write("</html>\n")
     file.close()
@@ -1189,12 +1273,31 @@ def create_html_report_group_results(
                 else:
                     clustering_dir = f"{group_dir}/clustering"
 
-                # display clustering results
+                # display clustering ARI results
+                if run is None:
+                    clustering_img = f"{clustering_dir}/clustering_results_ARI_{task}.png"
+                else:
+                    clustering_img = (
+                        f"{clustering_dir}/clustering_results_ARI_{task}_{run}.png"
+                    )
+                img = plt.imread(clustering_img)
+                height, width, _ = img.shape
+                # change the width so that height equals img_height
+                width = int(width * img_height / height)
+                # replace the path to the image with a relative path
+                clustering_img = clustering_img.replace(group_dir, ".")
+                file.write(
+                    f"<img src='{clustering_img}' alt='Clustering results' width='{width}' height='{img_height}'>\n"
+                )
+
+                file.write("<br>\n")
+
+                # display clustering SI results
                 if run is None:
-                    clustering_img = f"{clustering_dir}/clustering_results_{task}.png"
+                    clustering_img = f"{clustering_dir}/clustering_results_SI_{task}.png"
                 else:
                     clustering_img = (
-                        f"{clustering_dir}/clustering_results_{task}_{run}.png"
+                        f"{clustering_dir}/clustering_results_SI_{task}_{run}.png"
                     )
                 img = plt.imread(clustering_img)
                 height, width, _ = img.shape
@@ -1218,11 +1321,11 @@ def create_html_report_group_results(
         else:
             paradigm_clustering_dir = f"{group_dir}/paradigm_clustering"
 
-        # display paradigm clustering scores
+        # display paradigm clustering ARI scores
         img_height = 300
-        file.write("<h2>Paradigm Clustering Scores</h2>\n")
+        file.write("<h2>Paradigm Clustering ARI Scores</h2>\n")
         paradigm_clustering_img = (
-            f"{paradigm_clustering_dir}/paradigm_clustering_results.png"
+            f"{paradigm_clustering_dir}/paradigm_clustering_results_ARI.png"
         )
         img = plt.imread(paradigm_clustering_img)
         height, width, _ = img.shape
@@ -1236,36 +1339,54 @@ def create_html_report_group_results(
 
         file.write("<br>\n")
 
-        # display paradigm clustering centroids
+        # display paradigm clustering SI scores
         img_height = 300
-        file.write("<h2>Paradigm Clustering Centroids</h2>\n")
-        # find all png files in the directory
-        paradigm_clustering_centroids_dir = f"{group_dir}/paradigm_clustering_centroids"
-        for file_name in os.listdir(paradigm_clustering_centroids_dir):
-            if file_name.endswith(".png"):
-                measure_name = file_name[
-                    file_name.find("Task_Paradigm_Centroids_") + 24 : -4
-                ]
-                file.write(f"<h3>{measure_name}</h3>\n")
-                paradigm_clustering_centroids_img = (
-                    f"{paradigm_clustering_centroids_dir}/{file_name}"
-                )
-                # get the original size of the image
-                img = plt.imread(paradigm_clustering_centroids_img)
-                height, width, _ = img.shape
-                # change the width so that height equals img_height
-                width = int(width * img_height / height)
-                # replace the path to the image with a relative path
-                paradigm_clustering_centroids_img = (
-                    paradigm_clustering_centroids_img.replace(group_dir, ".")
-                )
-                file.write(
-                    f"<img src='{paradigm_clustering_centroids_img}' alt='Paradigm clustering centroids' width='{width}' height='{img_height}'>\n"
-                )
-                file.write("<br>\n")
+        file.write("<h2>Paradigm Clustering SI Scores</h2>\n")
+        paradigm_clustering_img = (
+            f"{paradigm_clustering_dir}/paradigm_clustering_results_SI.png"
+        )
+        img = plt.imread(paradigm_clustering_img)
+        height, width, _ = img.shape
+        # change the width so that height equals img_height
+        width = int(width * img_height / height)
+        # replace the path to the image with a relative path
+        paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".")
+        file.write(
+            f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
+        )
 
         file.write("<br>\n")
 
+        # # display paradigm clustering centroids
+        # img_height = 300
+        # file.write("<h2>Paradigm Clustering Centroids</h2>\n")
+        # # find all png files in the directory
+        # paradigm_clustering_centroids_dir = f"{group_dir}/paradigm_clustering_centroids"
+        # for file_name in os.listdir(paradigm_clustering_centroids_dir):
+        #     if file_name.endswith(".png"):
+        #         measure_name = file_name[
+        #             file_name.find("Task_Paradigm_Centroids_") + 24 : -4
+        #         ]
+        #         file.write(f"<h3>{measure_name}</h3>\n")
+        #         paradigm_clustering_centroids_img = (
+        #             f"{paradigm_clustering_centroids_dir}/{file_name}"
+        #         )
+        #         # get the original size of the image
+        #         img = plt.imread(paradigm_clustering_centroids_img)
+        #         height, width, _ = img.shape
+        #         # change the width so that height equals img_height
+        #         width = int(width * img_height / height)
+        #         # replace the path to the image with a relative path
+        #         paradigm_clustering_centroids_img = (
+        #             paradigm_clustering_centroids_img.replace(group_dir, ".")
+        #         )
+        #         file.write(
+        #             f"<img src='{paradigm_clustering_centroids_img}' alt='Paradigm clustering centroids' width='{width}' height='{img_height}'>\n"
+        #         )
+        #         file.write("<br>\n")
+
+        # file.write("<br>\n")
+
     file.write("</body>\n")
     file.write("</html>\n")
     file.close()
@@ -1409,20 +1530,20 @@ def create_html_report_group_results(
                     except Exception as e:
                         print(f"Error in plotting task presence: {e}")
 
-                    try:
-                        plot_dFC_clustering(
-                            dFC_root=dFC_root,
-                            subj=subj,
-                            task=task,
-                            start_time=start_time,
-                            end_time=end_time,
-                            output_root=reports_root,
-                            run=run,
-                            session=session,
-                            normalize_dFC=True,
-                        )
-                    except Exception as e:
-                        print(f"Error in plotting dFC clustering: {e}")
+                    # try:
+                    #     plot_dFC_clustering(
+                    #         dFC_root=dFC_root,
+                    #         subj=subj,
+                    #         task=task,
+                    #         start_time=start_time,
+                    #         end_time=end_time,
+                    #         output_root=reports_root,
+                    #         run=run,
+                    #         session=session,
+                    #         normalize_dFC=True,
+                    #     )
+                    # except Exception as e:
+                    #     print(f"Error in plotting dFC clustering: {e}")
         # create html report
         try:
             create_html_report_subj_results(
@@ -1470,14 +1591,14 @@ def create_html_report_group_results(
         except Exception as e:
             print(f"Error in plotting paradigm clustering scores: {e}")
 
-        try:
-            plot_paradigm_clstr_centroids(
-                ML_root=ML_root,
-                output_root=reports_root,
-                session=session,
-            )
-        except Exception as e:
-            print(f"Error in plotting paradigm clustering centroids: {e}")
+        # try:
+        #     plot_paradigm_clstr_centroids(
+        #         ML_root=ML_root,
+        #         output_root=reports_root,
+        #         session=session,
+        #     )
+        # except Exception as e:
+        #     print(f"Error in plotting paradigm clustering centroids: {e}")
 
         for task in TASKS:
             for run in RUNS[task]:

From bf74c8b34fa56e1b91d3ee2bd99dcd75e8cc73fa Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 31 Jul 2024 23:58:02 -0400
Subject: [PATCH 099/401] minor change

---
 task_dFC/generate_report.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 4d1bdae..21bc05b 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -1218,14 +1218,14 @@ def create_html_report_group_results(
                     classification_dir = f"{group_dir}/classification"
 
                 # display Random Forest classification results
-                file.write("<h3>Gradient Boosting</h3>\n")
+                file.write("<h3>KNN</h3>\n")
                 if run is None:
                     classification_img = (
-                        f"{classification_dir}/ML_results_classify_GBT_{task}.png"
+                        f"{classification_dir}/ML_results_classify_KNN_{task}.png"
                     )
                 else:
                     classification_img = (
-                        f"{classification_dir}/ML_results_classify_GBT_{task}_{run}.png"
+                        f"{classification_dir}/ML_results_classify_KNN_{task}_{run}.png"
                     )
                 img = plt.imread(classification_img)
                 height, width, _ = img.shape
@@ -1609,10 +1609,10 @@ def create_html_report_group_results(
                         task=task,
                         run=run,
                         session=session,
-                        ML_algorithm="Gradient Boosting",
+                        ML_algorithm="KNN",
                     )
                 except Exception as e:
-                    print(f"Error in plotting ML results for GBT: {e}")
+                    print(f"Error in plotting ML results for KNN: {e}")
                 try:
                     plot_ML_results(
                         ML_root=ML_root,

From ea220ba735d2135926a93d482dcd6b3c837fcf12 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 2 Aug 2024 15:31:13 -0400
Subject: [PATCH 100/401] concat+embed LE

---
 task_dFC/ML.py | 151 ++++++++++++++++++++++++++++---------------------
 1 file changed, 85 insertions(+), 66 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index eb35ccd..62c92d6 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -290,53 +290,88 @@ def embed_dFC_features(
     embedding="PCA",
     n_components=30,
     n_neighbors_LE=125,
+    LE_embedding_method="concat+embed",
 ):
     """
     Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous.
 
     for LE, first the LE is applied on each subj separately and then the procrustes transformation is applied to align the embeddings of different subjects.
     All the subjects are transformed into the space of the subject with the highest silhouette score.
+
+    LE_embedding_method: "concat+embed" or "embed+procrustes"
     """
     if embedding == "PCA":
         pca = PCA(n_components=n_components, svd_solver="full", whiten=False)
         pca.fit(X_train)
         X_train_embed = pca.transform(X_train)
-        X_test_embed = pca.transform(X_test)
+        if X_test is not None:
+            X_test_embed = pca.transform(X_test)
+        else:
+            X_test_embed = None
     elif embedding == "LE":
-        # first embed the dFC features of each subject into a lower dimensional space using LE separately
-        embed_dict = {}
-        for subject in train_subjects:
-            # assert the samples of the same subject are contiguous
-            assert np.all(
-                np.diff(np.where(subj_label_train == subject)[0]) == 1
-            ), f"Indices of {subject} are not consecutive"
-            X_subj = X_train[subj_label_train == subject, :]
-            y_subj = y_train[subj_label_train == subject]
-            LE = SpectralEmbedding(
-                n_components=n_components,
-                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
-            )
-            X_subj_embed = LE.fit_transform(X_subj)
-            SI = silhouette_score(X_subj_embed, y_subj)
-            embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI}
-
-        # find the best transformation based on the SI score
-        best_SI = -1
-        best_subject = None
-        for subject in embed_dict:
-            if embed_dict[subject]["SI"] > best_SI:
-                best_SI = embed_dict[subject]["SI"]
-                best_subject = subject
-
-        # apply procrustes transformation to align the embeddings of different subjects
-        # use the embeddings of the subject with the highest SI score as the reference
-        X_train_embed = None
-        for subject in train_subjects:
-            X_subj_embed = embed_dict[subject]["X_subj_embed"]
-            # procrustes transformation
-            if subject == best_subject:
-                X_subj_embed_transformed = X_subj_embed
-            else:
+        if LE_embedding_method == "embed+procrustes":
+            # first embed the dFC features of each subject into a lower dimensional space using LE separately
+            embed_dict = {}
+            for subject in train_subjects:
+                # assert the samples of the same subject are contiguous
+                assert np.all(
+                    np.diff(np.where(subj_label_train == subject)[0]) == 1
+                ), f"Indices of {subject} are not consecutive"
+                X_subj = X_train[subj_label_train == subject, :]
+                y_subj = y_train[subj_label_train == subject]
+                LE = SpectralEmbedding(
+                    n_components=n_components,
+                    n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                )
+                X_subj_embed = LE.fit_transform(X_subj)
+                SI = silhouette_score(X_subj_embed, y_subj)
+                embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI}
+
+            # find the best transformation based on the SI score
+            best_SI = -1
+            best_subject = None
+            for subject in embed_dict:
+                if embed_dict[subject]["SI"] > best_SI:
+                    best_SI = embed_dict[subject]["SI"]
+                    best_subject = subject
+
+            # apply procrustes transformation to align the embeddings of different subjects
+            # use the embeddings of the subject with the highest SI score as the reference
+            X_train_embed = None
+            for subject in train_subjects:
+                X_subj_embed = embed_dict[subject]["X_subj_embed"]
+                # procrustes transformation
+                if subject == best_subject:
+                    X_subj_embed_transformed = X_subj_embed
+                else:
+                    # for the procrustes transformation, the number of samples should be the same
+                    X_best_subj_embed = precheck_for_procruste(
+                        embed_dict[best_subject]["X_subj_embed"], X_subj_embed
+                    )
+                    _, X_subj_embed_transformed, _ = procrustes(
+                        X_best_subj_embed, X_subj_embed
+                    )
+                if X_train_embed is None:
+                    X_train_embed = X_subj_embed_transformed
+                else:
+                    X_train_embed = np.concatenate(
+                        (X_train_embed, X_subj_embed_transformed), axis=0
+                    )
+
+            # apply the same transformation to the test set
+            X_test_embed = None
+            for subject in test_subjects:
+                # assert the samples of the same subject are contiguous
+                assert np.all(
+                    np.diff(np.where(subj_label_test == subject)[0]) == 1
+                ), f"Indices of {subject} are not consecutive"
+                X_subj = X_test[subj_label_test == subject, :]
+                LE = SpectralEmbedding(
+                    n_components=n_components,
+                    n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                )
+                X_subj_embed = LE.fit_transform(X_subj)
+                # procrustes transformation
                 # for the procrustes transformation, the number of samples should be the same
                 X_best_subj_embed = precheck_for_procruste(
                     embed_dict[best_subject]["X_subj_embed"], X_subj_embed
@@ -344,38 +379,19 @@ def embed_dFC_features(
                 _, X_subj_embed_transformed, _ = procrustes(
                     X_best_subj_embed, X_subj_embed
                 )
-            if X_train_embed is None:
-                X_train_embed = X_subj_embed_transformed
-            else:
-                X_train_embed = np.concatenate(
-                    (X_train_embed, X_subj_embed_transformed), axis=0
-                )
-
-        # apply the same transformation to the test set
-        X_test_embed = None
-        for subject in test_subjects:
-            # assert the samples of the same subject are contiguous
-            assert np.all(
-                np.diff(np.where(subj_label_test == subject)[0]) == 1
-            ), f"Indices of {subject} are not consecutive"
-            X_subj = X_test[subj_label_test == subject, :]
-            LE = SpectralEmbedding(
-                n_components=n_components,
-                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
-            )
-            X_subj_embed = LE.fit_transform(X_subj)
-            # procrustes transformation
-            # for the procrustes transformation, the number of samples should be the same
-            X_best_subj_embed = precheck_for_procruste(
-                embed_dict[best_subject]["X_subj_embed"], X_subj_embed
-            )
-            _, X_subj_embed_transformed, _ = procrustes(X_best_subj_embed, X_subj_embed)
-            if X_test_embed is None:
-                X_test_embed = X_subj_embed_transformed
+                if X_test_embed is None:
+                    X_test_embed = X_subj_embed_transformed
+                else:
+                    X_test_embed = np.concatenate(
+                        (X_test_embed, X_subj_embed_transformed), axis=0
+                    )
+        elif LE_embedding_method == "concat+embed":
+            LE = SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors_LE)
+            X_train_embed = LE.fit_transform(X_train)
+            if X_test is not None:
+                X_test_embed = LE.transform(X_test)
             else:
-                X_test_embed = np.concatenate(
-                    (X_test_embed, X_subj_embed_transformed), axis=0
-                )
+                X_test_embed = None
 
     return X_train_embed, X_test_embed
 
@@ -698,6 +714,7 @@ def task_presence_classification(
         embedding="LE",
         n_components=30,
         n_neighbors_LE=125,
+        LE_embedding_method="concat+embed",
     )
 
     # task presence classification
@@ -831,6 +848,7 @@ def task_presence_clustering(
         embedding="LE",
         n_components=30,
         n_neighbors_LE=125,
+        LE_embedding_method="concat+embed",
     )
 
     # clustering
@@ -1083,6 +1101,7 @@ def task_paradigm_clustering(
             embedding="LE",
             n_components=30,
             n_neighbors_LE=125,
+            LE_embedding_method="concat+embed",
         )
 
         # clustering

From d63f7ec70b90f8f29f7c544e729fcbc1f0e12735 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 2 Aug 2024 16:40:33 -0400
Subject: [PATCH 101/401] minor fix

---
 task_dFC/ML.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 62c92d6..84472b6 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -386,10 +386,16 @@ def embed_dFC_features(
                         (X_test_embed, X_subj_embed_transformed), axis=0
                     )
         elif LE_embedding_method == "concat+embed":
+            # since SpectralEmbedding does not have transform method, we need to fit the LE on the whole data
+            if X_test is not None:
+                X_concat = np.concatenate((X_train, X_test), axis=0)
+            else:
+                X_concat = X_train
             LE = SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors_LE)
-            X_train_embed = LE.fit_transform(X_train)
+            X_concat_embed = LE.fit_transform(X_concat)
+            X_train_embed = X_concat_embed[: X_train.shape[0], :]
             if X_test is not None:
-                X_test_embed = LE.transform(X_test)
+                X_test_embed = X_concat_embed[X_train.shape[0] :, :]
             else:
                 X_test_embed = None
 

From 1b5a439a4c99b29f79144ed4e68e23a616390c1f Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 6 Aug 2024 17:58:16 -0400
Subject: [PATCH 102/401] add generalized LE and corr distance for LE

---
 task_dFC/ML.py | 343 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 262 insertions(+), 81 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 84472b6..8c16cca 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -12,7 +12,7 @@
 from sklearn.manifold import SpectralEmbedding
 from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score, silhouette_score
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
-from sklearn.neighbors import KNeighborsClassifier
+from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 
@@ -278,6 +278,245 @@ def precheck_for_procruste(X_best, X_subj):
     return X_best_new
 
 
+def generalized_procrustes(X_list):
+    """
+    Generalized Procrustes Analysis
+
+    returns the mean X to be used as the reference for procrustes transformation
+    """
+    # initialize Procrustes distance
+    current_distance = 0
+
+    # initialize a mean X
+    mean_X = np.array(X_list[0])
+
+    num_X = len(X_list)
+
+    # create array for new Xs, add
+    new_Xs = np.zeros(np.array(X_list).shape)
+
+    while True:
+        # add the mean X as first element of array
+        new_Xs[0] = mean_X
+
+        # superimpose all shapes to current mean
+        for i in range(1, num_X):
+            _, new_X, _ = procrustes(mean_X, X_list[i])
+            new_Xs[i] = new_X
+
+        # calculate new mean
+        new_mean = np.mean(new_Xs, axis=0)
+
+        _, _, new_distance = procrustes(new_mean, mean_X)
+
+        # if the distance did not change, break the cycle
+        if np.abs(new_distance - current_distance) < 1e-6:
+            break
+
+        # align the new_mean to old mean
+        _, new_mean, _ = procrustes(mean_X, new_mean)
+
+        # update mean and distance
+        mean_X = new_mean
+        current_distance = new_distance
+
+    return mean_X
+
+
+def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"):
+    """
+    Apply Laplacian Eigenmaps (LE) to transform data into a lower dimensional space.
+    """
+    affinity_matrix = kneighbors_graph(
+        X,
+        n_neighbors=n_neighbors,
+        mode="connectivity",
+        include_self=False,
+        metric=distance_metric,
+    )
+    affinity_matrix = affinity_matrix.toarray()
+    affinity_matrix = np.divide(affinity_matrix + affinity_matrix.T, 2)
+    LE = SpectralEmbedding(
+        n_components=n_components, affinity="precomputed", n_neighbors=n_neighbors
+    )
+    X_embed = LE.fit_transform(X=affinity_matrix)
+    return X_embed
+
+
+def LE_embed_procustes(
+    X_train,
+    X_test,
+    y_train,
+    y_test,
+    subj_label_train,
+    subj_label_test,
+    train_subjects,
+    test_subjects,
+    n_components=30,
+    n_neighbors_LE=125,
+    procruste_method="best_SI",
+):
+    if procruste_method == "best_SI":
+        # first embed the dFC features of each subject into a lower dimensional space using LE separately
+        embed_dict = {}
+        for subject in train_subjects:
+            # assert the samples of the same subject are contiguous
+            assert np.all(
+                np.diff(np.where(subj_label_train == subject)[0]) == 1
+            ), f"Indices of {subject} are not consecutive"
+            X_subj = X_train[subj_label_train == subject, :]
+            y_subj = y_train[subj_label_train == subject]
+            X_subj_embed = LE_transform(
+                X=X_subj,
+                n_components=n_components,
+                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                distance_metric="correlation",
+            )
+            SI = silhouette_score(X_subj_embed, y_subj)
+            embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI}
+
+        # find the best transformation based on the SI score
+        best_SI = -1
+        best_subject = None
+        for subject in embed_dict:
+            if embed_dict[subject]["SI"] > best_SI:
+                best_SI = embed_dict[subject]["SI"]
+                best_subject = subject
+
+        # apply procrustes transformation to align the embeddings of different subjects
+        # use the embeddings of the subject with the highest SI score as the reference
+        X_train_embed = None
+        for subject in train_subjects:
+            X_subj_embed = embed_dict[subject]["X_subj_embed"]
+            # procrustes transformation
+            if subject == best_subject:
+                X_subj_embed_transformed = X_subj_embed
+            else:
+                # for the procrustes transformation, the number of samples should be the same
+                X_best_subj_embed = precheck_for_procruste(
+                    embed_dict[best_subject]["X_subj_embed"], X_subj_embed
+                )
+                _, X_subj_embed_transformed, _ = procrustes(
+                    X_best_subj_embed, X_subj_embed
+                )
+            if X_train_embed is None:
+                X_train_embed = X_subj_embed_transformed
+            else:
+                X_train_embed = np.concatenate(
+                    (X_train_embed, X_subj_embed_transformed), axis=0
+                )
+
+        # apply the same transformation to the test set
+        X_test_embed = None
+        for subject in test_subjects:
+            # assert the samples of the same subject are contiguous
+            assert np.all(
+                np.diff(np.where(subj_label_test == subject)[0]) == 1
+            ), f"Indices of {subject} are not consecutive"
+            X_subj = X_test[subj_label_test == subject, :]
+            X_subj_embed = LE_transform(
+                X=X_subj,
+                n_components=n_components,
+                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                distance_metric="correlation",
+            )
+            # procrustes transformation
+            # for the procrustes transformation, the number of samples should be the same
+            X_best_subj_embed = precheck_for_procruste(
+                embed_dict[best_subject]["X_subj_embed"], X_subj_embed
+            )
+            _, X_subj_embed_transformed, _ = procrustes(X_best_subj_embed, X_subj_embed)
+            if X_test_embed is None:
+                X_test_embed = X_subj_embed_transformed
+            else:
+                X_test_embed = np.concatenate(
+                    (X_test_embed, X_subj_embed_transformed), axis=0
+                )
+
+    elif procruste_method == "generalized":
+        # in this method we use generalized procrustes analysis to align the embeddings of different subjects
+        # first embed the dFC features of each subject into a lower dimensional space using LE separately
+        embed_dict = {}
+        for subject in train_subjects:
+            # assert the samples of the same subject are contiguous
+            assert np.all(
+                np.diff(np.where(subj_label_train == subject)[0]) == 1
+            ), f"Indices of {subject} are not consecutive"
+            X_subj = X_train[subj_label_train == subject, :]
+            X_subj_embed = LE_transform(
+                X=X_subj,
+                n_components=n_components,
+                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                distance_metric="correlation",
+            )
+            embed_dict[subject] = X_subj_embed
+
+        # then find the max number of samples among all subjects
+        max_samples = 0
+        for subject in train_subjects:
+            if embed_dict[subject].shape[0] > max_samples:
+                max_samples = embed_dict[subject].shape[0]
+
+        # find the mean embedding of all subjects to use as the reference for procrustes transformation
+        X_train_list = []
+        for subject in train_subjects:
+            X_subj_embed = embed_dict[subject]
+            # add zero rows to the embedding of the subject with less samples
+            if X_subj_embed.shape[0] < max_samples:
+                X_subj_embed_new = np.concatenate(
+                    (
+                        X_subj_embed,
+                        np.zeros(
+                            (
+                                max_samples - X_subj_embed.shape[0],
+                                X_subj_embed.shape[1],
+                            )
+                        ),
+                    ),
+                    axis=0,
+                )
+            else:
+                X_subj_embed_new = X_subj_embed
+            X_train_list.append(X_subj_embed_new)
+        mean_X_train = generalized_procrustes(X_train_list)
+
+        X_train_embed = None
+        for subject in train_subjects:
+            X_subj_embed = embed_dict[subject]
+            mean_X_train_new_size = precheck_for_procruste(mean_X_train, X_subj_embed)
+            _, X_subj_embed_transformed, _ = procrustes(
+                mean_X_train_new_size, X_subj_embed
+            )
+            if X_train_embed is None:
+                X_train_embed = X_subj_embed_transformed
+            else:
+                X_train_embed = np.concatenate(
+                    (X_train_embed, X_subj_embed_transformed), axis=0
+                )
+
+        X_test_embed = None
+        for subject in test_subjects:
+            X_subj = X_test[subj_label_test == subject, :]
+            X_subj_embed = LE_transform(
+                X=X_subj,
+                n_components=n_components,
+                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                distance_metric="correlation",
+            )
+            mean_X_train_new_size = precheck_for_procruste(mean_X_train, X_subj_embed)
+            _, X_subj_embed_transformed, _ = procrustes(
+                mean_X_train_new_size, X_subj_embed
+            )
+            if X_test_embed is None:
+                X_test_embed = X_subj_embed_transformed
+            else:
+                X_test_embed = np.concatenate(
+                    (X_test_embed, X_subj_embed_transformed), axis=0
+                )
+
+    return X_train_embed, X_test_embed
+
+
 def embed_dFC_features(
     train_subjects,
     test_subjects,
@@ -310,89 +549,31 @@ def embed_dFC_features(
             X_test_embed = None
     elif embedding == "LE":
         if LE_embedding_method == "embed+procrustes":
-            # first embed the dFC features of each subject into a lower dimensional space using LE separately
-            embed_dict = {}
-            for subject in train_subjects:
-                # assert the samples of the same subject are contiguous
-                assert np.all(
-                    np.diff(np.where(subj_label_train == subject)[0]) == 1
-                ), f"Indices of {subject} are not consecutive"
-                X_subj = X_train[subj_label_train == subject, :]
-                y_subj = y_train[subj_label_train == subject]
-                LE = SpectralEmbedding(
-                    n_components=n_components,
-                    n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
-                )
-                X_subj_embed = LE.fit_transform(X_subj)
-                SI = silhouette_score(X_subj_embed, y_subj)
-                embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI}
-
-            # find the best transformation based on the SI score
-            best_SI = -1
-            best_subject = None
-            for subject in embed_dict:
-                if embed_dict[subject]["SI"] > best_SI:
-                    best_SI = embed_dict[subject]["SI"]
-                    best_subject = subject
-
-            # apply procrustes transformation to align the embeddings of different subjects
-            # use the embeddings of the subject with the highest SI score as the reference
-            X_train_embed = None
-            for subject in train_subjects:
-                X_subj_embed = embed_dict[subject]["X_subj_embed"]
-                # procrustes transformation
-                if subject == best_subject:
-                    X_subj_embed_transformed = X_subj_embed
-                else:
-                    # for the procrustes transformation, the number of samples should be the same
-                    X_best_subj_embed = precheck_for_procruste(
-                        embed_dict[best_subject]["X_subj_embed"], X_subj_embed
-                    )
-                    _, X_subj_embed_transformed, _ = procrustes(
-                        X_best_subj_embed, X_subj_embed
-                    )
-                if X_train_embed is None:
-                    X_train_embed = X_subj_embed_transformed
-                else:
-                    X_train_embed = np.concatenate(
-                        (X_train_embed, X_subj_embed_transformed), axis=0
-                    )
-
-            # apply the same transformation to the test set
-            X_test_embed = None
-            for subject in test_subjects:
-                # assert the samples of the same subject are contiguous
-                assert np.all(
-                    np.diff(np.where(subj_label_test == subject)[0]) == 1
-                ), f"Indices of {subject} are not consecutive"
-                X_subj = X_test[subj_label_test == subject, :]
-                LE = SpectralEmbedding(
-                    n_components=n_components,
-                    n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
-                )
-                X_subj_embed = LE.fit_transform(X_subj)
-                # procrustes transformation
-                # for the procrustes transformation, the number of samples should be the same
-                X_best_subj_embed = precheck_for_procruste(
-                    embed_dict[best_subject]["X_subj_embed"], X_subj_embed
-                )
-                _, X_subj_embed_transformed, _ = procrustes(
-                    X_best_subj_embed, X_subj_embed
-                )
-                if X_test_embed is None:
-                    X_test_embed = X_subj_embed_transformed
-                else:
-                    X_test_embed = np.concatenate(
-                        (X_test_embed, X_subj_embed_transformed), axis=0
-                    )
+            X_train_embed, X_test_embed = LE_embed_procustes(
+                X_train=X_train,
+                X_test=X_test,
+                y_train=y_train,
+                y_test=y_test,
+                subj_label_train=subj_label_train,
+                subj_label_test=subj_label_test,
+                train_subjects=train_subjects,
+                test_subjects=test_subjects,
+                n_components=n_components,
+                n_neighbors_LE=n_neighbors_LE,
+                procruste_method="generalized",
+            )
         elif LE_embedding_method == "concat+embed":
             # since SpectralEmbedding does not have transform method, we need to fit the LE on the whole data
             if X_test is not None:
                 X_concat = np.concatenate((X_train, X_test), axis=0)
             else:
                 X_concat = X_train
-            LE = SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors_LE)
-            X_concat_embed = LE.fit_transform(X_concat)
+            X_concat_embed = LE_transform(
+                X=X_concat,
+                n_components=n_components,
+                n_neighbors=min(n_neighbors_LE, X_concat.shape[0]),
+                distance_metric="correlation",
+            )
             X_train_embed = X_concat_embed[: X_train.shape[0], :]
             if X_test is not None:
                 X_test_embed = X_concat_embed[X_train.shape[0] :, :]
@@ -720,7 +901,7 @@ def task_presence_classification(
         embedding="LE",
         n_components=30,
         n_neighbors_LE=125,
-        LE_embedding_method="concat+embed",
+        LE_embedding_method="embed+procrustes",
     )
 
     # task presence classification
@@ -854,7 +1035,7 @@ def task_presence_clustering(
         embedding="LE",
         n_components=30,
         n_neighbors_LE=125,
-        LE_embedding_method="concat+embed",
+        LE_embedding_method="embed+procrustes",
     )
 
     # clustering
@@ -1107,7 +1288,7 @@ def task_paradigm_clustering(
             embedding="LE",
             n_components=30,
             n_neighbors_LE=125,
-            LE_embedding_method="concat+embed",
+            LE_embedding_method="embed+procrustes",
         )
 
         # clustering

From 84de66bc582102faebae066fd3148f015d822747 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 8 Aug 2024 12:03:48 -0400
Subject: [PATCH 103/401] remove outdated scripts

---
 simul_dFC/FCS_estimate.py   | 146 ------------
 simul_dFC/KNN_ML.py         | 460 ------------------------------------
 simul_dFC/dFC_assessment.py | 102 --------
 3 files changed, 708 deletions(-)
 delete mode 100644 simul_dFC/FCS_estimate.py
 delete mode 100644 simul_dFC/KNN_ML.py
 delete mode 100644 simul_dFC/dFC_assessment.py

diff --git a/simul_dFC/FCS_estimate.py b/simul_dFC/FCS_estimate.py
deleted file mode 100644
index 0fd7653..0000000
--- a/simul_dFC/FCS_estimate.py
+++ /dev/null
@@ -1,146 +0,0 @@
-import os
-import time
-import warnings
-
-import numpy as np
-
-from pydfc import MultiAnalysis, data_loader
-
-warnings.simplefilter("ignore")
-
-os.environ["MKL_NUM_THREADS"] = "16"
-os.environ["NUMEXPR_NUM_THREADS"] = "16"
-os.environ["OMP_NUM_THREADS"] = "16"
-
-################################# Parameters #################################
-# data paths
-dataset = "ds000002"
-# main_root = f"./DATA/{dataset}" # for local
-main_root = f"/data/origami/dFC/DATA/task-based/simulated/{dataset}"  # for server
-roi_root = f"{main_root}/derivatives/ROI_timeseries"
-output_root = f"{main_root}/derivatives/fitted_MEASURES"
-
-TASKS = [
-    "task-midFreqMidRest",
-    "task-lowFreqLongRest",
-    "task-lowFreqShortRest",
-    "task-lowFreqShortTask",
-    "task-highFreqLongRest",
-    "task-highFreqShortRest",
-    "task-midFreqMidRestNoisy",
-]
-
-job_id = int(os.getenv("SGE_TASK_ID"))
-TASK_id = job_id - 1  # SGE_TASK_ID starts from 1 not 0
-if TASK_id >= len(TASKS):
-    print("TASK_id out of TASKS")
-    exit()
-task = TASKS[TASK_id]
-
-###### MEASUREMENT PARAMETERS ######
-
-# W is in sec
-
-params_methods = {
-    # Sliding Parameters
-    "W": 12,
-    "n_overlap": 1.0,
-    "sw_method": "pear_corr",
-    "tapered_window": True,
-    # TIME_FREQ
-    "TF_method": "WTC",
-    # CLUSTERING AND DHMM
-    "clstr_base_measure": "SlidingWindow",
-    # HMM
-    "hmm_iter": 20,
-    "dhmm_obs_state_ratio": 16 / 24,
-    # State Parameters
-    "n_states": 5,
-    "n_subj_clstrs": 10,
-    # Parallelization Parameters
-    "n_jobs": 2,
-    "verbose": 0,
-    "backend": "loky",
-    # SESSION
-    "session": task,
-    # Hyper Parameters
-    "normalization": True,
-    "num_subj": None,
-    "num_time_point": None,
-}
-
-###### HYPER PARAMETERS ALTERNATIVE ######
-
-MEASURES_name_lst = [
-    "SlidingWindow",
-    "Time-Freq",
-    "CAP",
-    "ContinuousHMM",
-    "Windowless",
-    "Clustering",
-    "DiscreteHMM",
-]
-
-alter_hparams = {
-    # 'session': ['Rest1_RL', 'Rest2_LR', 'Rest2_RL'],
-    # 'n_overlap': [0, 0.25, 0.75, 1],
-    # 'n_states': [6, 16],
-    # # 'normalization': [],
-    # 'num_subj': [50, 100, 200],
-    # 'num_select_nodes': [30, 50, 333],
-    # 'num_time_point': [800, 1000],
-    # 'Fs_ratio': [0.50, 0.75, 1.5],
-    # 'noise_ratio': [1.00, 2.00, 3.00],
-    # 'num_realization': []
-}
-
-###### MultiAnalysis PARAMETERS ######
-
-params_multi_analysis = {
-    # Parallelization Parameters
-    "n_jobs": None,
-    "verbose": 0,
-    "backend": "loky",
-}
-
-################################# LOAD DATA #################################
-
-BOLD = data_loader.load_TS(
-    data_root=roi_root,
-    file_name="{subj_id}_{task}_time-series.npy",
-    SESSIONs=task,
-    subj_id2load=None,
-    task=task,
-)
-################################ Measures of dFC #################################
-
-MA = MultiAnalysis(
-    analysis_name=f"simulated-task-based-dFC-{dataset}-{task}", **params_multi_analysis
-)
-
-MEASURES_lst = MA.measures_initializer(MEASURES_name_lst, params_methods, alter_hparams)
-
-tic = time.time()
-print("Measurement Started ...")
-
-################################# estimate FCS #################################
-
-for MEASURE_id, measure in enumerate(MEASURES_lst):
-
-    print("MEASURE: " + measure.measure_name)
-    print("FCS estimation started...")
-
-    if measure.is_state_based:
-        measure.estimate_FCS(time_series=BOLD)
-
-    print("FCS estimation done.")
-
-    # Save
-    if not os.path.exists(f"{output_root}"):
-        os.makedirs(f"{output_root}")
-    np.save(f"{output_root}/MEASURE_{task}_{MEASURE_id}.npy", measure)
-
-print(f"Measurement required {time.time() - tic:0.3f} seconds.")
-np.save(f"{output_root}/multi-analysis_{task}.npy", MA)
-
-#################################################################################
diff --git a/simul_dFC/KNN_ML.py b/simul_dFC/KNN_ML.py
deleted file mode 100644
index c1b60cc..0000000
--- a/simul_dFC/KNN_ML.py
+++ /dev/null
@@ -1,460 +0,0 @@
-import argparse
-import json
-import os
-
-import numpy as np
-from sklearn.decomposition import PCA
-from sklearn.metrics import balanced_accuracy_score
-from sklearn.model_selection import GridSearchCV
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import StandardScaler
-
-from pydfc import DFC, data_loader, task_utils
-from pydfc.dfc_utils import dFC_mat2vec, rank_norm
-
-#######################################################################################
-
-
-def find_available_subjects(dFC_root, task, dFC_id=None):
-    """
-    Find the subjects that have dFC results for the given task and dFC_id (method).
-    """
-    SUBJECTS = list()
-    ALL_SUBJ_FOLDERS = os.listdir(f"{dFC_root}/")
-    ALL_SUBJ_FOLDERS = [folder for folder in ALL_SUBJ_FOLDERS if "sub-" in folder]
-    ALL_SUBJ_FOLDERS.sort()
-    for subj_folder in ALL_SUBJ_FOLDERS:
-        ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/")
-        ALL_DFC_FILES = [dFC_file for dFC_file in ALL_DFC_FILES if task in dFC_file]
-        if dFC_id is not None:
-            ALL_DFC_FILES = [
-                dFC_file for dFC_file in ALL_DFC_FILES if f"_{dFC_id}.npy" in dFC_file
-            ]
-        ALL_DFC_FILES.sort()
-        if len(ALL_DFC_FILES) > 0:
-            SUBJECTS.append(subj_folder)
-    return SUBJECTS
-
-
-def extract_task_features(TASKS, roi_root, output_root):
-    """
-    Extract task features from the event data."""
-    task_features = {
-        "task": list(),
-        "relative_task_on": list(),
-        "avg_task_duration": list(),
-        "var_task_duration": list(),
-        "avg_rest_duration": list(),
-        "var_rest_duration": list(),
-        "num_of_transitions": list(),
-        "relative_transition_freq": list(),
-    }
-    for task_id, task in enumerate(TASKS):
-
-        if task == "task-restingstate":
-            continue
-
-        SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task)
-
-        for subj in SUBJECTS:
-            # event data
-            task_data = np.load(
-                f"{roi_root}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
-            ).item()
-            Fs_task = task_data["Fs_task"]
-            TR_task = 1 / Fs_task
-
-            task_presence = task_utils.extract_task_presence(
-                event_labels=task_data["event_labels"],
-                TR_task=TR_task,
-                TR_mri=task_data["TR_mri"],
-                binary=True,
-            )
-
-            relative_task_on = task_utils.relative_task_on(task_presence)
-            # task duration
-            avg_task_duration, var_task_duration = task_utils.task_duration(
-                task_presence, task_data["TR_mri"]
-            )
-            # rest duration
-            avg_rest_duration, var_rest_duration = task_utils.rest_duration(
-                task_presence, task_data["TR_mri"]
-            )
-            # freq of transitions
-            num_of_transitions, relative_transition_freq = task_utils.transition_freq(
-                task_presence
-            )
-
-            task_features["task"].append(task)
-            task_features["relative_task_on"].append(relative_task_on)
-            task_features["avg_task_duration"].append(avg_task_duration)
-            task_features["var_task_duration"].append(var_task_duration)
-            task_features["avg_rest_duration"].append(avg_rest_duration)
-            task_features["var_rest_duration"].append(var_rest_duration)
-            task_features["num_of_transitions"].append(num_of_transitions)
-            task_features["relative_transition_freq"].append(relative_transition_freq)
-
-    folder = f"{output_root}"
-    if not os.path.exists(folder):
-        os.makedirs(folder)
-    np.save(f"{folder}/task_features_KNN_classify.npy", task_features)
-
-
-def dFC_feature_extraction_subj_lvl(
-    dFC,
-    task_data,
-    dynamic_pred="no",
-    normalize_dFC=True,
-):
-    """
-    Extract features and target for task presence classification
-    for a single subject.
-    """
-    # dFC features
-    dFC_mat = dFC.get_dFC_mat()
-    TR_array = dFC.TR_array
-    if normalize_dFC:
-        dFC_mat = rank_norm(dFC_mat)
-    dFC_vecs = dFC_mat2vec(dFC_mat)
-
-    # event data
-    task_presence = task_utils.extract_task_presence(
-        event_labels=task_data["event_labels"],
-        TR_task=1 / task_data["Fs_task"],
-        TR_mri=task_data["TR_mri"],
-        TR_array=TR_array,
-        binary=True,
-    )
-
-    features = dFC_vecs
-    target = task_presence.ravel()
-
-    if dynamic_pred == "past":
-        # concat current TR and two TR before of features to predict the current TR of target
-        # ignore the edge case of the first two TRs
-        features = np.concatenate(
-            (features, np.roll(features, 1, axis=0), np.roll(features, 2, axis=0)), axis=1
-        )
-        features = features[2:, :]
-        target = target[2:]
-    elif dynamic_pred == "past_and_future":
-        # concat current TR and two TR before and after of features to predict the current TR of target
-        # ignore the edge case of the first and last two TRs
-        features = np.concatenate(
-            (
-                features,
-                np.roll(features, 1, axis=0),
-                np.roll(features, 2, axis=0),
-                np.roll(features, -1, axis=0),
-                np.roll(features, -2, axis=0),
-            ),
-            axis=1,
-        )
-        features = features[2:-2, :]
-        target = target[2:-2]
-
-    return features, target
-
-
-def dFC_feature_extraction(
-    task,
-    train_subjects,
-    test_subjects,
-    dFC_id,
-    roi_root,
-    dFC_root,
-    dynamic_pred="no",
-    normalize_dFC=True,
-):
-    """
-    Extract features and target for task presence classification
-    for all subjects.
-    """
-    X_train = None
-    y_train = None
-    subj_label_train = list()
-    for subj in train_subjects:
-        dFC = np.load(
-            f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
-        ).item()
-
-        task_data = np.load(
-            f"{roi_root}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
-        ).item()
-
-        X_subj, y_subj = dFC_feature_extraction_subj_lvl(
-            dFC=dFC,
-            task_data=task_data,
-            dynamic_pred=dynamic_pred,
-            normalize_dFC=normalize_dFC,
-        )
-
-        subj_label_train.extend([subj for i in range(X_subj.shape[0])])
-        if X_train is None and y_train is None:
-            X_train = X_subj
-            y_train = y_subj
-        else:
-            X_train = np.concatenate((X_train, X_subj), axis=0)
-            y_train = np.concatenate((y_train, y_subj), axis=0)
-
-    X_test = None
-    y_test = None
-    subj_label_test = list()
-    for subj in test_subjects:
-        dFC = np.load(
-            f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
-        ).item()
-
-        task_data = np.load(
-            f"{roi_root}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
-        ).item()
-
-        X_subj, y_subj = dFC_feature_extraction_subj_lvl(
-            dFC=dFC,
-            task_data=task_data,
-            dynamic_pred=dynamic_pred,
-            normalize_dFC=normalize_dFC,
-        )
-
-        subj_label_test.extend([subj for i in range(X_subj.shape[0])])
-        if X_test is None and y_test is None:
-            X_test = X_subj
-            y_test = y_subj
-        else:
-            X_test = np.concatenate((X_test, X_subj), axis=0)
-            y_test = np.concatenate((y_test, y_subj), axis=0)
-
-    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
-    subj_label_train = np.array(subj_label_train)
-    subj_label_test = np.array(subj_label_test)
-
-    return (
-        X_train,
-        X_test,
-        y_train,
-        y_test,
-        subj_label_train,
-        subj_label_test,
-        dFC.measure.measure_name,
-    )
-
-
-def task_presence_classification(
-    task,
-    dFC_id,
-    roi_root,
-    dFC_root,
-    dynamic_pred="no",
-    normalize_dFC=True,
-    train_test_ratio=0.8,
-    explained_var_threshold=0.95,
-):
-    print(f"=============== {task} ===============")
-
-    if task == "task-restingstate":
-        return
-
-    SUBJECTS = find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id)
-
-    # randomly select train_test_ratio of the subjects for training
-    # and rest for testing using numpy.random.choice
-    train_subjects = np.random.choice(
-        SUBJECTS, int(train_test_ratio * len(SUBJECTS)), replace=False
-    )
-    test_subjects = np.setdiff1d(SUBJECTS, train_subjects)
-    print(
-        f"Number of train subjects: {len(train_subjects)} and test subjects: {len(test_subjects)}"
-    )
-
-    X_train, X_test, y_train, y_test, subj_label_train, subj_label_test, measure_name = (
-        dFC_feature_extraction(
-            task=task,
-            train_subjects=train_subjects,
-            test_subjects=test_subjects,
-            dFC_id=dFC_id,
-            roi_root=roi_root,
-            dFC_root=dFC_root,
-            dynamic_pred=dynamic_pred,
-            normalize_dFC=normalize_dFC,
-        )
-    )
-
-    # task presence classification
-
-    print("task presence classification ...")
-
-    # find num_PCs
-    pca = PCA(svd_solver="full", whiten=False)
-    pca.fit(X_train)
-    num_PCs = (
-        np.where(np.cumsum(pca.explained_variance_ratio_) > explained_var_threshold)[0][0]
-        + 1
-    )
-
-    # create a pipeline with a knn model to find the best n_neighbors
-    knn = make_pipeline(
-        StandardScaler(),
-        PCA(n_components=num_PCs),
-        KNeighborsClassifier(),
-    )
-    # create a dictionary of all values we want to test for n_neighbors
-    param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)}
-    # use gridsearch to test all values for n_neighbors
-    knn_gscv = GridSearchCV(knn, param_grid, cv=5)
-    # fit model to data
-    knn_gscv.fit(X_train, y_train)
-
-    n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"]
-
-    neigh = make_pipeline(
-        StandardScaler(),
-        PCA(n_components=num_PCs),
-        KNeighborsClassifier(n_neighbors=n_neighbors),
-    ).fit(X_train, y_train)
-
-    ML_RESULT = {
-        "pca": pca,
-        "num_PCs": num_PCs,
-        "cv_results": knn_gscv.cv_results_,
-        "KNN": neigh,
-        "KNN train score": neigh.score(X_train, y_train),
-        "KNN test score": neigh.score(X_test, y_test),
-    }
-
-    print(f"KNN train score {measure_name} {task}: {neigh.score(X_train, y_train)}")
-    print(f"KNN test score {measure_name} {task}: {neigh.score(X_test, y_test)}")
-
-    # measure pred score on each subj
-
-    ML_scores = {
-        "subj_id": list(),
-        "group": list(),
-        "task": list(),
-        "dFC method": list(),
-        "KNN accuracy": list(),
-    }
-    for subj in SUBJECTS:
-        ML_scores["subj_id"].append(subj)
-        if subj in train_subjects:
-            ML_scores["group"].append("train")
-            features = X_train[subj_label_train == subj, :]
-            target = y_train[subj_label_train == subj]
-        elif subj in test_subjects:
-            ML_scores["group"].append("test")
-            features = X_test[subj_label_test == subj, :]
-            target = y_test[subj_label_test == subj]
-
-        pred = neigh.predict(features)
-
-        ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred))
-
-        ML_scores["task"].append(task)
-        ML_scores["dFC method"].append(measure_name)
-
-    return ML_RESULT, ML_scores
-
-
-def run_classification(
-    TASKS,
-    roi_root,
-    dFC_root,
-    output_root,
-    dynamic_pred="no",
-    normalize_dFC=True,
-):
-    ML_scores = {
-        "subj_id": list(),
-        "group": list(),
-        "task": list(),
-        "dFC method": list(),
-        "KNN accuracy": list(),
-    }
-    for dFC_id in range(0, 7):
-        print(f"=================== dFC {dFC_id} ===================")
-
-        ML_RESULT = {}
-        for task_id, task in enumerate(TASKS):
-            ML_RESULT_new, ML_scores_new = task_presence_classification(
-                task=task,
-                dFC_id=dFC_id,
-                roi_root=roi_root,
-                dFC_root=dFC_root,
-                dynamic_pred=dynamic_pred,
-                normalize_dFC=normalize_dFC,
-            )
-            ML_RESULT[task] = ML_RESULT_new
-            for key in ML_scores:
-                ML_scores[key].extend(ML_scores_new[key])
-
-        folder = f"{output_root}"
-        if not os.path.exists(folder):
-            os.makedirs(folder)
-        np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT)
-
-    np.save(f"{folder}/ML_scores_KNN_classify.npy", ML_scores)
-
-
-#######################################################################################
-
-if __name__ == "__main__":
-    # argparse
-    HELPTEXT = """
-    Script to apply Machine Learning on dFC results to predict task presence.
-    """
-
-    parser = argparse.ArgumentParser(description=HELPTEXT)
-
-    parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
-
-    args = parser.parse_args()
-
-    dataset_info_file = args.dataset_info
-
-    # Read global configs
-    with open(dataset_info_file, "r") as f:
-        dataset_info = json.load(f)
-
-    print("Task presence prediction started ...")
-
-    TASKS = dataset_info["TASKS"]
-
-    if "{dataset}" in dataset_info["main_root"]:
-        main_root = dataset_info["main_root"].replace(
-            "{dataset}", dataset_info["dataset"]
-        )
-    else:
-        main_root = dataset_info["main_root"]
-
-    if "{main_root}" in dataset_info["roi_root"]:
-        roi_root = dataset_info["roi_root"].replace("{main_root}", main_root)
-    else:
-        roi_root = dataset_info["roi_root"]
-
-    if "{main_root}" in dataset_info["dFC_root"]:
-        dFC_root = dataset_info["dFC_root"].replace("{main_root}", main_root)
-    else:
-        dFC_root = dataset_info["dFC_root"]
-
-    if "{main_root}" in dataset_info["ML_root"]:
-        ML_root = dataset_info["ML_root"].replace("{main_root}", main_root)
-    else:
-        ML_root = dataset_info["ML_root"]
-
-    extract_task_features(
-        TASKS=TASKS,
-        roi_root=roi_root,
-        output_root=ML_root,
-    )
-    run_classification(
-        TASKS=TASKS,
-        roi_root=roi_root,
-        dFC_root=dFC_root,
-        output_root=ML_root,
-        dynamic_pred="no",
-        normalize_dFC=True,
-    )
-
-    print("Task presence prediction CODE finished running.")
-
-#######################################################################################
diff --git a/simul_dFC/dFC_assessment.py b/simul_dFC/dFC_assessment.py
deleted file mode 100644
index d140bd6..0000000
--- a/simul_dFC/dFC_assessment.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import os
-import time
-import warnings
-
-import numpy as np
-
-from pydfc import MultiAnalysis, data_loader
-
-warnings.simplefilter("ignore")
-
-os.environ["MKL_NUM_THREADS"] = "16"
-os.environ["NUMEXPR_NUM_THREADS"] = "16"
-os.environ["OMP_NUM_THREADS"] = "16"
-
-################################# Parameters #################################
-
-# Data parameters
-dataset = "ds000001"
-# main_root = f"./DATA/{dataset}" # for local
-main_root = f"../../DATA/task-based/simulated/{dataset}"  # for server
-
-# subjects used for dFC assessment do not need to be the same as those used for FCS_estimate
-# you can set the new roi root and data load parameters here:
-roi_root = f"{main_root}/derivatives/ROI_timeseries"
-fitted_measures_root = f"{main_root}/derivatives/fitted_MEASURES"
-output_root = f"{main_root}/derivatives/dFC_assessed"
-
-# for consistency we use 0 for resting state. will this cause a problem here??
-TASKS = ["task-pulse"]
-
-# find all subjects across all tasks
-SUBJECTS = data_loader.find_subj_list(data_root=roi_root, sessions=TASKS)
-
-# job_id selects the subject
-job_id = int(os.getenv("SGE_TASK_ID"))
-if job_id > len(SUBJECTS):
-    print("job_id > len(SUBJECTS)")
-    exit()
-subj_id = SUBJECTS[job_id - 1]  # SGE_TASK_ID starts from 1 not 0
-
-for task in TASKS:
-
-    MA = np.load(
-        f"{fitted_measures_root}/{task}/multi_analysis.npy", allow_pickle="TRUE"
-    ).item()
-
-    # check if the subject has this task
-    SUBJECTS_with_this_task = data_loader.find_subj_list(
-        data_root=roi_root, sessions=[task]
-    )
-    if not subj_id in SUBJECTS_with_this_task:
-        print(f"subject {subj_id} not in the list of subjects with task {task}")
-        continue
-
-    ################################# LOAD FIT MEASURES #################################
-
-    ALL_RECORDS = os.listdir(f"{fitted_measures_root}/{task}/")
-    ALL_RECORDS = [i for i in ALL_RECORDS if "MEASURE" in i]
-    ALL_RECORDS.sort()
-    MEASURES_fit_lst = list()
-    for s in ALL_RECORDS:
-        fit_measure = np.load(
-            f"{fitted_measures_root}/{task}/{s}", allow_pickle="TRUE"
-        ).item()
-        MEASURES_fit_lst.append(fit_measure)
-    MA.set_MEASURES_fit_lst(MEASURES_fit_lst)
-    print("fitted MEASURES loaded ...")
-
-    ################################# LOAD DATA #################################
-
-    print(
-        f"subject-level dFC assessment CODE started running ... for task {task} of subject {subj_id} ..."
-    )
-
-    BOLD = data_loader.load_TS(
-        data_root=roi_root,
-        file_name="time_series.npy",
-        SESSIONs=[task],
-        subj_id2load=subj_id,
-    )
-
-    ################################# dFC ASSESSMENT #################################
-
-    tic = time.time()
-    print("Measurement Started ...")
-
-    print("dFC estimation started...")
-    dFC_dict = MA.subj_lvl_dFC_assess(time_series_dict=BOLD)
-    print("dFC estimation done.")
-
-    print(f"Measurement required {time.time() - tic:0.3f} seconds.")
-
-    ################################# SAVE DATA #################################
-
-    folder = f"{output_root}/{task}/{subj_id}"
-    if not os.path.exists(folder):
-        os.makedirs(folder)
-
-    for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]):
-        np.save(f"{folder}/dFC_{str(dFC_id)}.npy", dFC)
-
-#######################################################################################

From b3061355373dfcd5d3f89192d93779f60c6bacd7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 8 Aug 2024 12:06:33 -0400
Subject: [PATCH 104/401] add ml_utils

---
 pydfc/ml_utils.py           | 1115 +++++++++++++++++++++++++++++++++++
 pydfc/task_utils.py         |    8 +-
 task_dFC/ML.py              | 1096 +---------------------------------
 task_dFC/generate_report.py |    8 +-
 4 files changed, 1135 insertions(+), 1092 deletions(-)
 create mode 100644 pydfc/ml_utils.py

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
new file mode 100644
index 0000000..275753b
--- /dev/null
+++ b/pydfc/ml_utils.py
@@ -0,0 +1,1115 @@
+# -*- coding: utf-8 -*-
+"""
+Functions to facilitate applying ML algorithms to dFC.
+
+Created on Aug 8 2024
+@author: Mohammad Torabi
+"""
+import os
+
+import numpy as np
+from scipy.spatial import procrustes
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
+from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.manifold import SpectralEmbedding
+from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score, silhouette_score
+from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
+from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+
+from .dfc_utils import dFC_mat2vec, dFC_vec2mat, rank_norm
+from .task_utils import (
+    calc_relative_task_on,
+    calc_rest_duration,
+    calc_task_duration,
+    calc_transition_freq,
+    extract_task_presence,
+)
+
+################################# Feature Loading Functions ####################################
+
+
+def find_available_subjects(dFC_root, task, run=None, session=None, dFC_id=None):
+    """
+    Find the subjects that have dFC results for the given task and dFC_id (method).
+
+    Only entries of dFC_root whose name contains "sub-" are considered. If session
+    is given, files are searched in {dFC_root}/{subj}/{session}/ and subjects
+    without that folder are skipped instead of raising an error. run, session and
+    dFC_id (when not None) must all appear in the name of at least one dFC file.
+
+    Returns the sorted list of matching subject folder names.
+    """
+    required = [f"_{task}_"]
+    if dFC_id is not None:
+        required.append(f"_{dFC_id}.npy")
+    if run is not None:
+        required.append(f"_{run}_")
+    if session is not None:
+        required.append(f"_{session}_")
+
+    SUBJECTS = list()
+    for subj_folder in sorted(os.listdir(f"{dFC_root}/")):
+        if "sub-" not in subj_folder:
+            continue
+        dFC_dir = f"{dFC_root}/{subj_folder}/"
+        if session is not None:
+            dFC_dir = f"{dFC_dir}{session}/"
+        # skip stray "sub-*" files and subjects lacking this session instead of crashing
+        if not os.path.isdir(dFC_dir):
+            continue
+        if any(all(tag in name for tag in required) for name in os.listdir(dFC_dir)):
+            SUBJECTS.append(subj_folder)
+    return SUBJECTS
+
+
+def load_dFC(dFC_root, subj, task, dFC_id, run=None, session=None):
+    """
+    Load the dFC results for a given subject, task, dFC_id, run and session.
+
+    The file that is read is
+        {dFC_root}/{subj}/[{session}/]dFC_[{session}_]{task}_[{run}_]{dFC_id}.npy
+    where the bracketed parts are only present when session / run are not None.
+
+    Returns the object stored in the file (unwrapped with .item()).
+    """
+    folder = f"{dFC_root}/{subj}"
+    name_parts = ["dFC"]
+    if session is not None:
+        # session-specific results live in a sub-folder and carry the session tag
+        folder = f"{folder}/{session}"
+        name_parts.append(session)
+    name_parts.append(task)
+    if run is not None:
+        name_parts.append(run)
+    name_parts.append(dFC_id)
+    file_name = "_".join(str(part) for part in name_parts)
+
+    # NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files
+    dFC = np.load(f"{folder}/{file_name}.npy", allow_pickle=True).item()
+
+    return dFC
+
+
+def load_task_data(roi_root, subj, task, run=None, session=None):
+    """
+    Load the task data for a given subject, task, run and session.
+
+    The file that is read is
+        {roi_root}/{subj}/[{session}/]{subj}_[{session}_]{task}_[{run}_]task-data.npy
+    where the bracketed parts are only present when session / run are not None.
+
+    Returns the object stored in the file (unwrapped with .item()); in this module
+    it is indexed with the keys "event_labels", "Fs_task" and "TR_mri".
+    """
+    folder = f"{roi_root}/{subj}"
+    name_parts = [subj]
+    if session is not None:
+        # session-specific data live in a sub-folder and carry the session tag
+        folder = f"{folder}/{session}"
+        name_parts.append(session)
+    name_parts.append(task)
+    if run is not None:
+        name_parts.append(run)
+    name_parts.append("task-data")
+    file_name = "_".join(str(part) for part in name_parts)
+
+    # NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files
+    task_data = np.load(f"{folder}/{file_name}.npy", allow_pickle=True).item()
+
+    return task_data
+
+
+################################# Feature Extraction Functions ####################################
+
+
+def extract_task_features(TASKS, RUNS, SESSIONS, roi_root, dFC_root, output_root):
+    """
+    Extract task features from the event data.
+
+    For every session, the timing features of each (task, run, subject) are
+    collected and saved to {output_root}/[{session}/]task_features.npy as a dict
+    of equally long lists (one entry per subject and run).
+
+    TASKS: list of task names; "task-restingstate" is skipped.
+    RUNS: RUNS[task] lists the runs of that task.
+    SESSIONS: list of sessions; a None entry stands for data without session level.
+    roi_root: root folder of the task data files (see load_task_data).
+    dFC_root: root folder of the dFC results, used to find the available subjects.
+    output_root: folder in which the features are saved (created if missing).
+    """
+    for session in SESSIONS:
+        task_features = {
+            "task": list(),
+            "run": list(),
+            "relative_task_on": list(),
+            "avg_task_duration": list(),
+            "var_task_duration": list(),
+            "avg_rest_duration": list(),
+            "var_rest_duration": list(),
+            "num_of_transitions": list(),
+            "relative_transition_freq": list(),
+        }
+        for task in TASKS:
+            # resting state has no task paradigm to describe
+            if task == "task-restingstate":
+                continue
+
+            for run in RUNS[task]:
+                SUBJECTS = find_available_subjects(
+                    dFC_root=dFC_root, task=task, run=run, session=session
+                )
+
+                for subj in SUBJECTS:
+                    # event data
+                    task_data = load_task_data(
+                        roi_root=roi_root, subj=subj, task=task, run=run, session=session
+                    )
+                    TR_mri = task_data["TR_mri"]
+                    task_presence = extract_task_presence(
+                        event_labels=task_data["event_labels"],
+                        TR_task=1 / task_data["Fs_task"],
+                        TR_mri=TR_mri,
+                        binary=True,
+                        binarizing_method="mean",
+                    )
+
+                    # summary features of the binarized task presence signal
+                    relative_task_on = calc_relative_task_on(task_presence)
+                    avg_task, var_task = calc_task_duration(task_presence, TR_mri)
+                    avg_rest, var_rest = calc_rest_duration(task_presence, TR_mri)
+                    num_trans, rel_trans_freq = calc_transition_freq(task_presence)
+
+                    task_features["task"].append(task)
+                    task_features["run"].append(run)
+                    task_features["relative_task_on"].append(relative_task_on)
+                    task_features["avg_task_duration"].append(avg_task)
+                    task_features["var_task_duration"].append(var_task)
+                    task_features["avg_rest_duration"].append(avg_rest)
+                    task_features["var_rest_duration"].append(var_rest)
+                    task_features["num_of_transitions"].append(num_trans)
+                    task_features["relative_transition_freq"].append(rel_trans_freq)
+
+        # one output folder per session
+        if session is None:
+            folder = f"{output_root}"
+        else:
+            folder = f"{output_root}/{session}"
+        # exist_ok avoids the check-then-create race between parallel jobs
+        os.makedirs(folder, exist_ok=True)
+        np.save(f"{folder}/task_features.npy", task_features)
+
+
+def dFC_feature_extraction_subj_lvl(
+    dFC,
+    task_data,
+    dynamic_pred="no",
+    normalize_dFC=True,
+):
+    """
+    Build the features and the target of a single subject for
+    task presence classification.
+
+    dFC: object providing get_dFC_mat() and TR_array.
+    task_data: indexed with "event_labels", "Fs_task" and "TR_mri".
+    dynamic_pred: "no", "past", "past_and_future"
+        "past": the features of the two previous TRs are appended to each sample.
+        "past_and_future": the features of the two previous and the two next TRs
+            are appended to each sample.
+        any other value: each sample only holds the features of its own TR.
+    normalize_dFC: if True, rank_norm is applied to the dFC matrices.
+
+    Returns (features, target); the samples at the edges that lack the
+    requested neighbors are dropped from both.
+    """
+    # dFC features
+    dFC_mat = dFC.get_dFC_mat()
+    TR_array = dFC.TR_array
+    dFC_vecs = dFC_mat2vec(rank_norm(dFC_mat) if normalize_dFC else dFC_mat)
+
+    # event data
+    target = extract_task_presence(
+        event_labels=task_data["event_labels"],
+        TR_task=1 / task_data["Fs_task"],
+        TR_mri=task_data["TR_mri"],
+        TR_array=TR_array,
+        binary=True,
+        binarizing_method="mean",
+    ).ravel()
+
+    if dynamic_pred == "past":
+        # current TR followed by the two TRs before it;
+        # the first two samples have no complete past and are dropped
+        shifts, keep = (0, 1, 2), slice(2, None)
+    elif dynamic_pred == "past_and_future":
+        # current TR followed by the two TRs before and the two TRs after it;
+        # the first and the last two samples are dropped
+        shifts, keep = (0, 1, 2, -1, -2), slice(2, -2)
+    else:
+        # no temporal context requested
+        return dFC_vecs, target
+
+    # np.roll wraps around the first axis, which is why the samples
+    # at the edges are discarded after stacking the shifted copies
+    features = np.concatenate(
+        [np.roll(dFC_vecs, shift, axis=0) for shift in shifts], axis=1
+    )
+
+    return features[keep, :], target[keep]
+
+
+def _collect_subject_features(
+    subjects,
+    task,
+    dFC_id,
+    roi_root,
+    dFC_root,
+    run,
+    session,
+    dynamic_pred,
+    normalize_dFC,
+    dFC_measure_name,
+):
+    """
+    Load and stack the features and targets of the given subjects.
+
+    Returns (X, y, subj_label, dFC_measure_name); X and y are None when
+    subjects is empty. dFC_measure_name is the measure name seen so far (or None)
+    and is checked against the dFC of every subject.
+    """
+    X_list = list()
+    y_list = list()
+    subj_label = list()
+    for subj in subjects:
+        dFC = load_dFC(
+            dFC_root=dFC_root,
+            subj=subj,
+            task=task,
+            dFC_id=dFC_id,
+            run=run,
+            session=session,
+        )
+        task_data = load_task_data(
+            roi_root=roi_root, subj=subj, task=task, run=run, session=session
+        )
+
+        X_subj, y_subj = dFC_feature_extraction_subj_lvl(
+            dFC=dFC,
+            task_data=task_data,
+            dynamic_pred=dynamic_pred,
+            normalize_dFC=normalize_dFC,
+        )
+
+        X_list.append(X_subj)
+        y_list.append(y_subj)
+        subj_label.extend([subj] * X_subj.shape[0])
+
+        if dFC_measure_name is None:
+            dFC_measure_name = dFC.measure.measure_name
+        else:
+            assert (
+                dFC_measure_name == dFC.measure.measure_name
+            ), "dFC measure is not consistent."
+
+    # concatenate once instead of re-copying the growing arrays for every subject
+    X = np.concatenate(X_list, axis=0) if X_list else None
+    y = np.concatenate(y_list, axis=0) if y_list else None
+
+    return X, y, np.array(subj_label), dFC_measure_name
+
+
+def dFC_feature_extraction(
+    task,
+    train_subjects,
+    test_subjects,
+    dFC_id,
+    roi_root,
+    dFC_root,
+    run=None,
+    session=None,
+    dynamic_pred="no",
+    normalize_dFC=True,
+):
+    """
+    Extract features and target for task presence classification
+    for all subjects.
+    if run is specified, dFC results for that run will be used.
+
+    Returns (X_train, X_test, y_train, y_test, subj_label_train, subj_label_test,
+    dFC_measure_name). The subj_label arrays hold the subject of every sample.
+    X and y of an empty subject list are None.
+    Raises AssertionError if the subjects were not assessed with the same dFC measure.
+    """
+    shared_args = dict(
+        task=task,
+        dFC_id=dFC_id,
+        roi_root=roi_root,
+        dFC_root=dFC_root,
+        run=run,
+        session=session,
+        dynamic_pred=dynamic_pred,
+        normalize_dFC=normalize_dFC,
+    )
+    X_train, y_train, subj_label_train, dFC_measure_name = _collect_subject_features(
+        subjects=train_subjects, dFC_measure_name=None, **shared_args
+    )
+    X_test, y_test, subj_label_test, dFC_measure_name = _collect_subject_features(
+        subjects=test_subjects, dFC_measure_name=dFC_measure_name, **shared_args
+    )
+
+    return (
+        X_train,
+        X_test,
+        y_train,
+        y_test,
+        subj_label_train,
+        subj_label_test,
+        dFC_measure_name,
+    )
+
+
+################################# Feature Embedding Functions ####################################
+
+
+def precheck_for_procruste(X_best, X_subj):
+    """
+    Return a copy of X_best whose number of rows equals that of X_subj.
+
+    procrustes needs both matrices to have the same number of samples (rows):
+    if X_best is too short it is padded with zero rows at the end,
+    if it is too long its trailing rows are cut off.
+    The returned array never shares memory with X_best.
+
+    X_best: 2D reference embedding.
+    X_subj: 2D embedding whose number of rows has to be matched.
+    """
+    n_rows_subj = X_subj.shape[0]
+    # > 0: X_best lacks rows, < 0: X_best has surplus rows
+    n_missing = n_rows_subj - X_best.shape[0]
+
+    if n_missing > 0:
+        # the reference is shorter: append zero rows
+        zero_rows = np.zeros((n_missing, X_best.shape[1]))
+        return np.concatenate((X_best, zero_rows), axis=0).copy()
+
+    if n_missing < 0:
+        # the reference is longer: keep only its first rows
+        return X_best[:n_rows_subj, :].copy()
+
+    # same number of rows already; still hand back a copy so that the caller
+    # never holds a reference to X_best itself
+    return X_best.copy()
+
+
+def generalized_procrustes(X_list, tol=1e-6, max_iter=1000):
+    """
+    Generalized Procrustes Analysis
+
+    returns the mean X to be used as the reference for procrustes transformation
+
+    X_list: sequence of arrays that all have the same shape.
+    tol: stop once the Procrustes distance between consecutive means changes
+        by less than tol.
+    max_iter: upper bound on the number of iterations, so that a mean that never
+        settles cannot make the loop run forever; the last mean is returned then.
+    """
+    # initialize Procrustes distance and the mean X
+    current_distance = 0
+    mean_X = np.array(X_list[0])
+
+    # create array for new Xs; slot 0 is reserved for the current mean
+    new_Xs = np.zeros(np.array(X_list).shape)
+
+    for _iteration in range(max_iter):
+        # add the mean X as first element of array
+        new_Xs[0] = mean_X
+
+        # superimpose all shapes to current mean
+        for i in range(1, len(X_list)):
+            _, new_Xs[i], _ = procrustes(mean_X, X_list[i])
+
+        # calculate new mean and its Procrustes distance to the old mean
+        new_mean = np.mean(new_Xs, axis=0)
+        _, _, new_distance = procrustes(new_mean, mean_X)
+
+        # if the distance did not change, break the cycle
+        if np.abs(new_distance - current_distance) < tol:
+            break
+
+        # align the new_mean to old mean
+        _, new_mean, _ = procrustes(mean_X, new_mean)
+
+        # update mean and distance
+        mean_X = new_mean
+        current_distance = new_distance
+
+    return mean_X
+
+
+def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"):
+    """
+    Embed the rows of X into n_components dimensions with Laplacian Eigenmaps (LE),
+    using a symmetrized k-nearest-neighbors graph as precomputed affinity.
+    """
+    knn_graph = kneighbors_graph(
+        X,
+        n_neighbors=n_neighbors,
+        mode="connectivity",
+        include_self=False,
+        metric=distance_metric,
+    ).toarray()
+    # average the graph with its transpose to obtain a symmetric affinity matrix
+    symmetric_affinity = (knn_graph + knn_graph.T) / 2
+    embedder = SpectralEmbedding(
+        n_components=n_components, affinity="precomputed", n_neighbors=n_neighbors
+    )
+    return embedder.fit_transform(X=symmetric_affinity)
+
+
+def LE_embed_procustes(
+    X_train,
+    X_test,
+    y_train,
+    y_test,
+    subj_label_train,
+    subj_label_test,
+    train_subjects,
+    test_subjects,
+    n_components=30,
+    n_neighbors_LE=125,
+    procruste_method="best_SI",
+):
+    if procruste_method == "best_SI":
+        # first embed the dFC features of each subject into a lower dimensional space using LE separately
+        embed_dict = {}
+        for subject in train_subjects:
+            # assert the samples of the same subject are contiguous
+            assert np.all(
+                np.diff(np.where(subj_label_train == subject)[0]) == 1
+            ), f"Indices of {subject} are not consecutive"
+            X_subj = X_train[subj_label_train == subject, :]
+            y_subj = y_train[subj_label_train == subject]
+            X_subj_embed = LE_transform(
+                X=X_subj,
+                n_components=n_components,
+                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                distance_metric="correlation",
+            )
+            SI = silhouette_score(X_subj_embed, y_subj)
+            embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI}
+
+        # find the best transformation based on the SI score
+        best_SI = -1
+        best_subject = None
+        for subject in embed_dict:
+            if embed_dict[subject]["SI"] > best_SI:
+                best_SI = embed_dict[subject]["SI"]
+                best_subject = subject
+
+        # apply procrustes transformation to align the embeddings of different subjects
+        # use the embeddings of the subject with the highest SI score as the reference
+        X_train_embed = None
+        for subject in train_subjects:
+            X_subj_embed = embed_dict[subject]["X_subj_embed"]
+            # procrustes transformation
+            if subject == best_subject:
+                X_subj_embed_transformed = X_subj_embed
+            else:
+                # for the procrustes transformation, the number of samples should be the same
+                X_best_subj_embed = precheck_for_procruste(
+                    embed_dict[best_subject]["X_subj_embed"], X_subj_embed
+                )
+                _, X_subj_embed_transformed, _ = procrustes(
+                    X_best_subj_embed, X_subj_embed
+                )
+            if X_train_embed is None:
+                X_train_embed = X_subj_embed_transformed
+            else:
+                X_train_embed = np.concatenate(
+                    (X_train_embed, X_subj_embed_transformed), axis=0
+                )
+
+        # apply the same transformation to the test set
+        X_test_embed = None
+        for subject in test_subjects:
+            # assert the samples of the same subject are contiguous
+            assert np.all(
+                np.diff(np.where(subj_label_test == subject)[0]) == 1
+            ), f"Indices of {subject} are not consecutive"
+            X_subj = X_test[subj_label_test == subject, :]
+            X_subj_embed = LE_transform(
+                X=X_subj,
+                n_components=n_components,
+                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                distance_metric="correlation",
+            )
+            # procrustes transformation
+            # for the procrustes transformation, the number of samples should be the same
+            X_best_subj_embed = precheck_for_procruste(
+                embed_dict[best_subject]["X_subj_embed"], X_subj_embed
+            )
+            _, X_subj_embed_transformed, _ = procrustes(X_best_subj_embed, X_subj_embed)
+            if X_test_embed is None:
+                X_test_embed = X_subj_embed_transformed
+            else:
+                X_test_embed = np.concatenate(
+                    (X_test_embed, X_subj_embed_transformed), axis=0
+                )
+
+    elif procruste_method == "generalized":
+        # in this method we use generalized procrustes analysis to align the embeddings of different subjects
+        # first embed the dFC features of each subject into a lower dimensional space using LE separately
+        embed_dict = {}
+        for subject in train_subjects:
+            # assert the samples of the same subject are contiguous
+            assert np.all(
+                np.diff(np.where(subj_label_train == subject)[0]) == 1
+            ), f"Indices of {subject} are not consecutive"
+            X_subj = X_train[subj_label_train == subject, :]
+            X_subj_embed = LE_transform(
+                X=X_subj,
+                n_components=n_components,
+                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                distance_metric="correlation",
+            )
+            embed_dict[subject] = X_subj_embed
+
+        # then find the max number of samples among all subjects
+        max_samples = 0
+        for subject in train_subjects:
+            if embed_dict[subject].shape[0] > max_samples:
+                max_samples = embed_dict[subject].shape[0]
+
+        # find the mean embedding of all subjects to use as the reference for procrustes transformation
+        X_train_list = []
+        for subject in train_subjects:
+            X_subj_embed = embed_dict[subject]
+            # add zero rows to the embedding of the subject with less samples
+            if X_subj_embed.shape[0] < max_samples:
+                X_subj_embed_new = np.concatenate(
+                    (
+                        X_subj_embed,
+                        np.zeros(
+                            (
+                                max_samples - X_subj_embed.shape[0],
+                                X_subj_embed.shape[1],
+                            )
+                        ),
+                    ),
+                    axis=0,
+                )
+            else:
+                X_subj_embed_new = X_subj_embed
+            X_train_list.append(X_subj_embed_new)
+        mean_X_train = generalized_procrustes(X_train_list)
+
+        X_train_embed = None
+        for subject in train_subjects:
+            X_subj_embed = embed_dict[subject]
+            mean_X_train_new_size = precheck_for_procruste(mean_X_train, X_subj_embed)
+            _, X_subj_embed_transformed, _ = procrustes(
+                mean_X_train_new_size, X_subj_embed
+            )
+            if X_train_embed is None:
+                X_train_embed = X_subj_embed_transformed
+            else:
+                X_train_embed = np.concatenate(
+                    (X_train_embed, X_subj_embed_transformed), axis=0
+                )
+
+        X_test_embed = None
+        for subject in test_subjects:
+            X_subj = X_test[subj_label_test == subject, :]
+            X_subj_embed = LE_transform(
+                X=X_subj,
+                n_components=n_components,
+                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                distance_metric="correlation",
+            )
+            mean_X_train_new_size = precheck_for_procruste(mean_X_train, X_subj_embed)
+            _, X_subj_embed_transformed, _ = procrustes(
+                mean_X_train_new_size, X_subj_embed
+            )
+            if X_test_embed is None:
+                X_test_embed = X_subj_embed_transformed
+            else:
+                X_test_embed = np.concatenate(
+                    (X_test_embed, X_subj_embed_transformed), axis=0
+                )
+
+    return X_train_embed, X_test_embed
+
+
+def embed_dFC_features(
+    train_subjects,
+    test_subjects,
+    X_train,
+    X_test,
+    y_train,
+    y_test,
+    subj_label_train,
+    subj_label_test,
+    embedding="PCA",
+    n_components=30,
+    n_neighbors_LE=125,
+    LE_embedding_method="concat+embed",
+):
+    """
+    Embed the dFC features into a lower dimensional space using PCA or LE.
+
+    embedding: "PCA" (fitted on X_train only, then applied to X_test) or "LE".
+    LE_embedding_method (only used for LE):
+        "concat+embed": train and test samples are concatenated and embedded together.
+        "embed+procrustes": each subject is embedded separately, then aligned by
+            generalized procrustes analysis; samples of a subject must be contiguous.
+    Unknown options raise ValueError. With PCA or "concat+embed", X_test may be None.
+    """
+    if embedding == "PCA":
+        pca = PCA(n_components=n_components, svd_solver="full", whiten=False)
+        pca.fit(X_train)
+        X_train_embed = pca.transform(X_train)
+        X_test_embed = pca.transform(X_test) if X_test is not None else None
+    elif embedding == "LE":
+        if LE_embedding_method == "embed+procrustes":
+            X_train_embed, X_test_embed = LE_embed_procustes(
+                X_train=X_train,
+                X_test=X_test,
+                y_train=y_train,
+                y_test=y_test,
+                subj_label_train=subj_label_train,
+                subj_label_test=subj_label_test,
+                train_subjects=train_subjects,
+                test_subjects=test_subjects,
+                n_components=n_components,
+                n_neighbors_LE=n_neighbors_LE,
+                procruste_method="generalized",
+            )
+        elif LE_embedding_method == "concat+embed":
+            # since SpectralEmbedding does not have transform method, we need to fit the LE on the whole data
+            n_train = X_train.shape[0]
+            X_concat = X_train
+            if X_test is not None:
+                X_concat = np.concatenate((X_train, X_test), axis=0)
+            X_concat_embed = LE_transform(
+                X=X_concat,
+                n_components=n_components,
+                n_neighbors=min(n_neighbors_LE, X_concat.shape[0]),
+                distance_metric="correlation",
+            )
+            X_train_embed = X_concat_embed[:n_train, :]
+            X_test_embed = X_concat_embed[n_train:, :] if X_test is not None else None
+        else:
+            raise ValueError(f"Unknown LE_embedding_method: {LE_embedding_method!r}")
+    else:
+        raise ValueError(f"Unknown embedding: {embedding!r}")
+
+    return X_train_embed, X_test_embed
+
+
+################################# Classification Framework Functions ####################################
+
+
+def logistic_regression_classify(X_train, y_train, X_test, y_test):
+    """
+    Logistic regression classification
+
+    C is selected with a 5-fold grid search on the training data.
+    return: dict with the fitted model, the selected C and the train/test scores
+    """
+    # create a pipeline with a logistic regression model to find the best C
+    logistic_reg = make_pipeline(
+        StandardScaler(), LogisticRegression(penalty="l1", solver="saga")
+    )
+    # create a dictionary of all values we want to test for C
+    param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
+    # use gridsearch to test all values for C
+    lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=5)
+    # fit model to data
+    lr_gscv.fit(X_train, y_train)
+
+    C = lr_gscv.best_params_["logisticregression__C"]
+    # GridSearchCV (refit=True by default) already refitted this pipeline with the best C
+    log_reg = lr_gscv.best_estimator_
+
+    RESULT = {
+        "log_reg_model": log_reg,
+        "log_reg_C": C,
+        "log_reg_train_score": log_reg.score(X_train, y_train),
+        "log_reg_test_score": log_reg.score(X_test, y_test),
+    }
+
+    return RESULT
+
+
+def KNN_classify(X_train, y_train, X_test, y_test):
+    """
+    KNN classification
+
+    n_neighbors is selected from 1 to 29 with a 5-fold grid search on the training data.
+    return: dict with the cv results, the fitted model and the train/test scores
+    """
+    # create a pipeline with a knn model to find the best n_neighbors
+    knn = make_pipeline(
+        StandardScaler(),
+        KNeighborsClassifier(),
+    )
+    # create a dictionary of all values we want to test for n_neighbors
+    param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)}
+    # use gridsearch to test all values for n_neighbors
+    knn_gscv = GridSearchCV(knn, param_grid, cv=5)
+    # fit model to data
+    knn_gscv.fit(X_train, y_train)
+
+    # GridSearchCV (refit=True by default) has already refitted the pipeline with the
+    # best n_neighbors on the whole training set, so it is used as the final model
+    neigh = knn_gscv.best_estimator_
+
+    RESULT = {
+        "KNN_cv_results": knn_gscv.cv_results_,
+        "KNN_model": neigh,
+        "KNN_train_score": neigh.score(X_train, y_train),
+        "KNN_test_score": neigh.score(X_test, y_test),
+    }
+
+    return RESULT
+
+
+def random_forest_classify(X_train, y_train, X_test, y_test):
+    """
+    Random Forest classification
+
+    n_estimators and max_depth are selected with a 5-fold grid search on X_train.
+    return: dict with the cv results, the fitted model and the train/test scores
+    """
+    # create a pipeline with a random forest model to find the best n_estimators
+    rf = make_pipeline(
+        StandardScaler(),
+        RandomForestClassifier(),
+    )
+    # create a dictionary of all values we want to test for n_estimators and max_depth
+    param_grid = {
+        "randomforestclassifier__n_estimators": [10, 50, 100, 200],
+        "randomforestclassifier__max_depth": [None, 5, 10, 20, 30],
+    }
+    # use gridsearch to test all combinations of n_estimators and max_depth
+    rf_gscv = GridSearchCV(rf, param_grid, cv=5)
+    # fit model to data
+    rf_gscv.fit(X_train, y_train)
+
+    # GridSearchCV (refit=True by default) has already refitted the pipeline with the
+    # best n_estimators and max_depth on the whole training set, so it is used as the
+    # final model instead of fitting a new forest
+    rf = rf_gscv.best_estimator_
+
+    RESULT = {
+        "RF_cv_results": rf_gscv.cv_results_,
+        "RF_model": rf,
+        "RF_train_score": rf.score(X_train, y_train),
+        "RF_test_score": rf.score(X_test, y_test),
+    }
+
+    return RESULT
+
+
+def gradient_boosting_classify(X_train, y_train, X_test, y_test):
+    """
+    Gradient Boosting classification
+
+    n_estimators, learning_rate and max_depth are selected with a 5-fold grid search
+    on X_train.
+
+    return: dict with the cv results, the fitted model and the train/test scores
+    """
+    # create a pipeline with a gradient boosting model to find the best n_estimators
+    gb = make_pipeline(
+        StandardScaler(),
+        GradientBoostingClassifier(),
+    )
+    # create a dictionary of all values we want to test for each hyperparameter
+    param_grid = {
+        "gradientboostingclassifier__n_estimators": [10, 50, 100, 200],
+        "gradientboostingclassifier__learning_rate": [0.01, 0.1, 0.2],
+        "gradientboostingclassifier__max_depth": [3, 5, 10],
+    }
+    # use gridsearch to test all combinations of the hyperparameters
+    gb_gscv = GridSearchCV(gb, param_grid, cv=5)
+    # fit model to data
+    gb_gscv.fit(X_train, y_train)
+
+    # GridSearchCV (refit=True by default) has already refitted the pipeline with the
+    # best n_estimators, learning_rate and max_depth on the whole training set, so it
+    # is used as the final model instead of running the (expensive) boosting fit
+    # a second time
+    gb = gb_gscv.best_estimator_
+
+    RESULT = {
+        "GB_cv_results": gb_gscv.cv_results_,
+        "GB_model": gb,
+        "GB_train_score": gb.score(X_train, y_train),
+        "GB_test_score": gb.score(X_test, y_test),
+    }
+
+    return RESULT
+
+
+def task_presence_classification(
+    task,
+    dFC_id,
+    roi_root,
+    dFC_root,
+    run=None,
+    session=None,
+    dynamic_pred="no",
+    normalize_dFC=True,
+    train_test_ratio=0.8,
+):
+    """
+    Classify task presence from the dFC features of a given task, dFC method and run
+    using logistic regression and KNN.
+
+    The available subjects are randomly split into train and test subjects according
+    to train_test_ratio, the dFC features are embedded with LE and the classifiers are
+    fitted on the samples of the train subjects.
+
+    return: ML_RESULT (fitted models and their scores on the train and test sets) and
+        ML_scores (balanced accuracy of each model on each subject).
+        Nothing is returned for "task-restingstate".
+    """
+    title = f"{task}" if run is None else f"{task} {run}"
+    print(f"=============== {title} ===============")
+
+    if task == "task-restingstate":
+        return
+
+    SUBJECTS = find_available_subjects(
+        dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id
+    )
+
+    # draw the train subjects at random (without replacement),
+    # the remaining subjects are used for testing
+    n_train_subjects = int(train_test_ratio * len(SUBJECTS))
+    train_subjects = np.random.choice(SUBJECTS, n_train_subjects, replace=False)
+    test_subjects = np.setdiff1d(SUBJECTS, train_subjects)
+    print(
+        f"Number of train subjects: {len(train_subjects)} and test subjects: {len(test_subjects)}"
+    )
+
+    X_train, X_test, y_train, y_test, subj_label_train, subj_label_test, measure_name = (
+        dFC_feature_extraction(
+            task=task,
+            train_subjects=train_subjects,
+            test_subjects=test_subjects,
+            dFC_id=dFC_id,
+            roi_root=roi_root,
+            dFC_root=dFC_root,
+            run=run,
+            session=session,
+            dynamic_pred=dynamic_pred,
+            normalize_dFC=normalize_dFC,
+        )
+    )
+
+    # embed dFC features
+    X_train, X_test = embed_dFC_features(
+        train_subjects=train_subjects,
+        test_subjects=test_subjects,
+        X_train=X_train,
+        X_test=X_test,
+        y_train=y_train,
+        y_test=y_test,
+        subj_label_train=subj_label_train,
+        subj_label_test=subj_label_test,
+        embedding="LE",
+        n_components=30,
+        n_neighbors_LE=125,
+        LE_embedding_method="embed+procrustes",
+    )
+
+    # task presence classification
+
+    print("task presence classification ...")
+
+    # logistic regression
+    log_reg_RESULT = logistic_regression_classify(X_train, y_train, X_test, y_test)
+
+    # KNN
+    KNN_RESULT = KNN_classify(X_train, y_train, X_test, y_test)
+
+    # # Random Forest
+    # RF_RESULT = random_forest_classify(
+    #     X_train, y_train, X_test, y_test
+    # )
+
+    # # Gradient Boosting
+    # GBT_RESULT = gradient_boosting_classify(
+    #     X_train, y_train, X_test, y_test
+    # )
+
+    # merge the results of all the classifiers into a single dict
+    ML_RESULT = {**log_reg_RESULT, **KNN_RESULT}
+    # ML_RESULT.update(RF_RESULT)
+    # ML_RESULT.update(GBT_RESULT)
+
+    # measure pred score on each subj
+
+    ML_scores = {
+        "subj_id": [],
+        "group": [],
+        "task": [],
+        "run": [],
+        "dFC method": [],
+        "Logistic regression accuracy": [],
+        "KNN accuracy": [],
+        # "Random Forest accuracy": [],
+        # "Gradient Boosting accuracy": [],
+    }
+    log_reg = log_reg_RESULT["log_reg_model"]
+    KNN = KNN_RESULT["KNN_model"]
+    # RF = RF_RESULT["RF_model"]
+    # GBT = GBT_RESULT["GB_model"]
+
+    for subj in SUBJECTS:
+        # columns that identify this subject's entry
+        ML_scores["subj_id"].append(subj)
+        ML_scores["task"].append(task)
+        ML_scores["run"].append(run)
+        ML_scores["dFC method"].append(measure_name)
+
+        # select the samples of this subject from the set it belongs to
+        if subj in train_subjects:
+            ML_scores["group"].append("train")
+            subj_mask = subj_label_train == subj
+            features, target = X_train[subj_mask, :], y_train[subj_mask]
+        elif subj in test_subjects:
+            ML_scores["group"].append("test")
+            subj_mask = subj_label_test == subj
+            features, target = X_test[subj_mask, :], y_test[subj_mask]
+
+        # balanced accuracy of each classifier on the samples of this subject
+        ML_scores["Logistic regression accuracy"].append(
+            balanced_accuracy_score(target, log_reg.predict(features))
+        )
+        ML_scores["KNN accuracy"].append(
+            balanced_accuracy_score(target, KNN.predict(features))
+        )
+        # ML_scores["Random Forest accuracy"].append(
+        #     balanced_accuracy_score(target, RF.predict(features))
+        # )
+        # ML_scores["Gradient Boosting accuracy"].append(
+        #     balanced_accuracy_score(target, GBT.predict(features))
+        # )
+
+    return ML_RESULT, ML_scores
+
+
+################################# Clustering Framework Functions ####################################
+
+
+def task_presence_clustering(
+    task,
+    dFC_id,
+    roi_root,
+    dFC_root,
+    run=None,
+    session=None,
+    normalize_dFC=True,
+):
+    """
+    Cluster the embedded dFC features of a given task, dFC method and run into two
+    clusters with kmeans and compare the clusters with the labels y.
+
+    return: clustering_RESULTS (fitted StandardScaler and kmeans, and the ARI over the
+        samples of all subjects) and clustering_scores (for each subject, the kmeans ARI
+        and the silhouette score of the labels y).
+        Nothing is returned for "task-restingstate".
+    """
+    title = f"{task}" if run is None else f"{task} {run}"
+    print(f"=============== {title} ===============")
+
+    if task == "task-restingstate":
+        return
+
+    SUBJECTS = find_available_subjects(
+        dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id
+    )
+
+    print(f"Number of subjects: {len(SUBJECTS)}")
+
+    X, _, y, _, subj_label, _, measure_name = dFC_feature_extraction(
+        task=task,
+        train_subjects=SUBJECTS,
+        test_subjects=[],
+        dFC_id=dFC_id,
+        roi_root=roi_root,
+        dFC_root=dFC_root,
+        run=run,
+        session=session,
+        dynamic_pred="no",
+        normalize_dFC=normalize_dFC,
+    )
+
+    # embed dFC features
+    X, _ = embed_dFC_features(
+        train_subjects=SUBJECTS,
+        test_subjects=[],
+        X_train=X,
+        X_test=None,
+        y_train=y,
+        y_test=None,
+        subj_label_train=subj_label,
+        subj_label_test=None,
+        embedding="LE",
+        n_components=30,
+        n_neighbors_LE=125,
+        LE_embedding_method="embed+procrustes",
+    )
+
+    # apply kmeans clustering to the standardized dFC features
+    n_clusters = 2  # corresponding to task and rest
+    scaler = StandardScaler()
+    X_normalized = scaler.fit_transform(X)
+    kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
+    labels_pred = kmeans.fit_predict(X_normalized)
+
+    ARI = adjusted_rand_score(y, labels_pred)
+    print(f"ARI score: {ARI}")
+
+    # # visualize clustering centroids
+    # centroids = kmeans.cluster_centers_
+    # centroids = pca.inverse_transform(centroids)
+    # centroids = scaler.inverse_transform(centroids)
+    # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
+    # centroids_mat = dFC_vec2mat(centroids, n_regions)
+
+    clustering_RESULTS = {
+        "StandardScaler": scaler,
+        "kmeans": kmeans,
+        "ARI": ARI,
+        # "centroids": centroids_mat,
+    }
+
+    clustering_scores = {
+        "subj_id": [],
+        "task": [],
+        "run": [],
+        "dFC method": [],
+        "Kmeans ARI": [],
+        "SI": [],
+    }
+    for subj in SUBJECTS:
+        subj_mask = subj_label == subj
+        features, target = X[subj_mask, :], y[subj_mask]
+        pred_kmeans = kmeans.predict(scaler.transform(features))
+
+        clustering_scores["subj_id"].append(subj)
+        clustering_scores["task"].append(task)
+        clustering_scores["run"].append(run)
+        clustering_scores["dFC method"].append(measure_name)
+        clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_kmeans))
+        # silhouette score in terms of separability of original labels, not the clustering labels
+        clustering_scores["SI"].append(silhouette_score(features, target))
+
+    return clustering_RESULTS, clustering_scores
diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 4dedc52..8a11cbf 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -314,7 +314,7 @@ def extract_task_presence(
 ################################# Task Features ####################################
 
 
-def relative_task_on(task_presence):
+def calc_relative_task_on(task_presence):
     """
     task_presence: 0, 1 array
     return: relative_task_on
@@ -322,7 +322,7 @@ def relative_task_on(task_presence):
     return np.sum(task_presence) / len(task_presence)
 
 
-def task_duration(task_presence, TR_mri):
+def calc_task_duration(task_presence, TR_mri):
     """
     task_presence: 0, 1 array
     return: avg_task_duration, var_task_duration
@@ -339,7 +339,7 @@ def task_duration(task_presence, TR_mri):
     return np.mean(task_durations), np.var(task_durations)
 
 
-def rest_duration(task_presence, TR_mri):
+def calc_rest_duration(task_presence, TR_mri):
     """
     task_presence: 0, 1 array
     return: avg_rest_duration, var_rest_duration
@@ -361,7 +361,7 @@ def rest_duration(task_presence, TR_mri):
     return np.mean(rest_durations), np.var(rest_durations)
 
 
-def transition_freq(task_presence):
+def calc_transition_freq(task_presence):
     """
     task_presence: 0, 1 array
     return: num_of_transitions, relative_transition_freq
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 8c16cca..1fb7bd1 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -4,1095 +4,22 @@
 import traceback
 
 import numpy as np
-from scipy.spatial import procrustes
 from sklearn.cluster import KMeans
-from sklearn.decomposition import PCA
-from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
-from sklearn.linear_model import LogisticRegression
-from sklearn.manifold import SpectralEmbedding
-from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score, silhouette_score
-from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
-from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph
-from sklearn.pipeline import make_pipeline
+from sklearn.metrics import adjusted_rand_score, silhouette_score
 from sklearn.preprocessing import StandardScaler
 
-from pydfc import DFC, data_loader, task_utils
-from pydfc.dfc_utils import dFC_mat2vec, dFC_vec2mat, rank_norm
+from pydfc.ml_utils import (
+    dFC_feature_extraction,
+    embed_dFC_features,
+    extract_task_features,
+    find_available_subjects,
+    task_presence_classification,
+    task_presence_clustering,
+)
 
 #######################################################################################
 
 
-def find_available_subjects(dFC_root, task, run=None, session=None, dFC_id=None):
-    """
-    Find the subjects that have dFC results for the given task and dFC_id (method).
-    """
-    SUBJECTS = list()
-    ALL_SUBJ_FOLDERS = os.listdir(f"{dFC_root}/")
-    ALL_SUBJ_FOLDERS = [folder for folder in ALL_SUBJ_FOLDERS if "sub-" in folder]
-    ALL_SUBJ_FOLDERS.sort()
-    for subj_folder in ALL_SUBJ_FOLDERS:
-        if session is None:
-            ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/")
-        else:
-            ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/{session}/")
-        ALL_DFC_FILES = [
-            dFC_file for dFC_file in ALL_DFC_FILES if f"_{task}_" in dFC_file
-        ]
-        if dFC_id is not None:
-            ALL_DFC_FILES = [
-                dFC_file for dFC_file in ALL_DFC_FILES if f"_{dFC_id}.npy" in dFC_file
-            ]
-        if run is not None:
-            ALL_DFC_FILES = [
-                dFC_file for dFC_file in ALL_DFC_FILES if f"_{run}_" in dFC_file
-            ]
-        if session is not None:
-            ALL_DFC_FILES = [
-                dFC_file for dFC_file in ALL_DFC_FILES if f"_{session}_" in dFC_file
-            ]
-        ALL_DFC_FILES.sort()
-        if len(ALL_DFC_FILES) > 0:
-            SUBJECTS.append(subj_folder)
-    return SUBJECTS
-
-
-def extract_task_features(TASKS, RUNS, SESSIONS, roi_root, output_root):
-    """
-    Extract task features from the event data."""
-    for session in SESSIONS:
-        task_features = {
-            "task": list(),
-            "run": list(),
-            "relative_task_on": list(),
-            "avg_task_duration": list(),
-            "var_task_duration": list(),
-            "avg_rest_duration": list(),
-            "var_rest_duration": list(),
-            "num_of_transitions": list(),
-            "relative_transition_freq": list(),
-        }
-        for task_id, task in enumerate(TASKS):
-
-            if task == "task-restingstate":
-                continue
-
-            for run in RUNS[task]:
-
-                SUBJECTS = find_available_subjects(
-                    dFC_root=dFC_root, task=task, run=run, session=session
-                )
-
-                for subj in SUBJECTS:
-                    # event data
-                    task_data = load_task_data(
-                        roi_root=roi_root, subj=subj, task=task, run=run, session=session
-                    )
-                    Fs_task = task_data["Fs_task"]
-                    TR_task = 1 / Fs_task
-
-                    task_presence = task_utils.extract_task_presence(
-                        event_labels=task_data["event_labels"],
-                        TR_task=TR_task,
-                        TR_mri=task_data["TR_mri"],
-                        binary=True,
-                        binarizing_method="mean",
-                    )
-
-                    relative_task_on = task_utils.relative_task_on(task_presence)
-                    # task duration
-                    avg_task_duration, var_task_duration = task_utils.task_duration(
-                        task_presence, task_data["TR_mri"]
-                    )
-                    # rest duration
-                    avg_rest_duration, var_rest_duration = task_utils.rest_duration(
-                        task_presence, task_data["TR_mri"]
-                    )
-                    # freq of transitions
-                    num_of_transitions, relative_transition_freq = (
-                        task_utils.transition_freq(task_presence)
-                    )
-
-                    task_features["task"].append(task)
-                    task_features["run"].append(run)
-                    task_features["relative_task_on"].append(relative_task_on)
-                    task_features["avg_task_duration"].append(avg_task_duration)
-                    task_features["var_task_duration"].append(var_task_duration)
-                    task_features["avg_rest_duration"].append(avg_rest_duration)
-                    task_features["var_rest_duration"].append(var_rest_duration)
-                    task_features["num_of_transitions"].append(num_of_transitions)
-                    task_features["relative_transition_freq"].append(
-                        relative_transition_freq
-                    )
-
-        if session is None:
-            folder = f"{output_root}"
-        else:
-            folder = f"{output_root}/{session}"
-        if not os.path.exists(folder):
-            os.makedirs(folder)
-        np.save(f"{folder}/task_features.npy", task_features)
-
-
-def dFC_feature_extraction_subj_lvl(
-    dFC,
-    task_data,
-    dynamic_pred="no",
-    normalize_dFC=True,
-):
-    """
-    Extract features and target for task presence classification
-    for a single subject.
-    dynamic_pred: "no", "past", "past_and_future"
-    """
-    # dFC features
-    dFC_mat = dFC.get_dFC_mat()
-    TR_array = dFC.TR_array
-    if normalize_dFC:
-        dFC_mat = rank_norm(dFC_mat)
-    dFC_vecs = dFC_mat2vec(dFC_mat)
-
-    # event data
-    task_presence = task_utils.extract_task_presence(
-        event_labels=task_data["event_labels"],
-        TR_task=1 / task_data["Fs_task"],
-        TR_mri=task_data["TR_mri"],
-        TR_array=TR_array,
-        binary=True,
-        binarizing_method="mean",
-    )
-
-    features = dFC_vecs
-    target = task_presence.ravel()
-
-    if dynamic_pred == "past":
-        # concat current TR and two TR before of features to predict the current TR of target
-        # ignore the edge case of the first two TRs
-        features = np.concatenate(
-            (features, np.roll(features, 1, axis=0), np.roll(features, 2, axis=0)), axis=1
-        )
-        features = features[2:, :]
-        target = target[2:]
-    elif dynamic_pred == "past_and_future":
-        # concat current TR and two TR before and after of features to predict the current TR of target
-        # ignore the edge case of the first and last two TRs
-        features = np.concatenate(
-            (
-                features,
-                np.roll(features, 1, axis=0),
-                np.roll(features, 2, axis=0),
-                np.roll(features, -1, axis=0),
-                np.roll(features, -2, axis=0),
-            ),
-            axis=1,
-        )
-        features = features[2:-2, :]
-        target = target[2:-2]
-
-    return features, target
-
-
-def load_dFC(dFC_root, subj, task, dFC_id, run=None, session=None):
-    """
-    Load the dFC results for a given subject, task, dFC_id, run and session.
-    """
-    if session is None:
-        if run is None:
-            dFC = np.load(
-                f"{dFC_root}/{subj}/dFC_{task}_{dFC_id}.npy", allow_pickle="TRUE"
-            ).item()
-        else:
-            dFC = np.load(
-                f"{dFC_root}/{subj}/dFC_{task}_{run}_{dFC_id}.npy", allow_pickle="TRUE"
-            ).item()
-    else:
-        if run is None:
-            dFC = np.load(
-                f"{dFC_root}/{subj}/{session}/dFC_{session}_{task}_{dFC_id}.npy",
-                allow_pickle="TRUE",
-            ).item()
-        else:
-            dFC = np.load(
-                f"{dFC_root}/{subj}/{session}/dFC_{session}_{task}_{run}_{dFC_id}.npy",
-                allow_pickle="TRUE",
-            ).item()
-
-    return dFC
-
-
-def load_task_data(roi_root, subj, task, run=None, session=None):
-    """
-    Load the task data for a given subject, task and run.
-    """
-    if session is None:
-        if run is None:
-            task_data = np.load(
-                f"{roi_root}/{subj}/{subj}_{task}_task-data.npy", allow_pickle="TRUE"
-            ).item()
-        else:
-            task_data = np.load(
-                f"{roi_root}/{subj}/{subj}_{task}_{run}_task-data.npy",
-                allow_pickle="TRUE",
-            ).item()
-    else:
-        if run is None:
-            task_data = np.load(
-                f"{roi_root}/{subj}/{session}/{subj}_{session}_{task}_task-data.npy",
-                allow_pickle="TRUE",
-            ).item()
-        else:
-            task_data = np.load(
-                f"{roi_root}/{subj}/{session}/{subj}_{session}_{task}_{run}_task-data.npy",
-                allow_pickle="TRUE",
-            ).item()
-
-    return task_data
-
-
-def precheck_for_procruste(X_best, X_subj):
-    """
-    Check if the two matrices have the same number of rows. if not, make them the same.
-    """
-    # for the procrustes transformation, the number of samples should be the same
-    if X_subj.shape[0] > X_best.shape[0]:
-        # add zero rows to the embedding of the best subject
-        X_best_new = np.concatenate(
-            (
-                X_best,
-                np.zeros(
-                    (
-                        X_subj.shape[0] - X_best.shape[0],
-                        X_best.shape[1],
-                    )
-                ),
-            ),
-            axis=0,
-        )
-    elif X_subj.shape[0] < X_best.shape[0]:
-        # remove extra rows from the embedding of the best subject
-        X_best_new = X_best[: X_subj.shape[0], :]
-    else:
-        X_best_new = X_best
-
-    X_best_new = X_best_new.copy()
-
-    return X_best_new
-
-
-def generalized_procrustes(X_list):
-    """
-    Generalized Procrustes Analysis
-
-    returns the mean X to be used as the reference for procrustes transformation
-    """
-    # initialize Procrustes distance
-    current_distance = 0
-
-    # initialize a mean X
-    mean_X = np.array(X_list[0])
-
-    num_X = len(X_list)
-
-    # create array for new Xs, add
-    new_Xs = np.zeros(np.array(X_list).shape)
-
-    while True:
-        # add the mean X as first element of array
-        new_Xs[0] = mean_X
-
-        # superimpose all shapes to current mean
-        for i in range(1, num_X):
-            _, new_X, _ = procrustes(mean_X, X_list[i])
-            new_Xs[i] = new_X
-
-        # calculate new mean
-        new_mean = np.mean(new_Xs, axis=0)
-
-        _, _, new_distance = procrustes(new_mean, mean_X)
-
-        # if the distance did not change, break the cycle
-        if np.abs(new_distance - current_distance) < 1e-6:
-            break
-
-        # align the new_mean to old mean
-        _, new_mean, _ = procrustes(mean_X, new_mean)
-
-        # update mean and distance
-        mean_X = new_mean
-        current_distance = new_distance
-
-    return mean_X
-
-
-def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"):
-    """
-    Apply Laplacian Eigenmaps (LE) to transform data into a lower dimensional space.
-    """
-    affinity_matrix = kneighbors_graph(
-        X,
-        n_neighbors=n_neighbors,
-        mode="connectivity",
-        include_self=False,
-        metric=distance_metric,
-    )
-    affinity_matrix = affinity_matrix.toarray()
-    affinity_matrix = np.divide(affinity_matrix + affinity_matrix.T, 2)
-    LE = SpectralEmbedding(
-        n_components=n_components, affinity="precomputed", n_neighbors=n_neighbors
-    )
-    X_embed = LE.fit_transform(X=affinity_matrix)
-    return X_embed
-
-
-def LE_embed_procustes(
-    X_train,
-    X_test,
-    y_train,
-    y_test,
-    subj_label_train,
-    subj_label_test,
-    train_subjects,
-    test_subjects,
-    n_components=30,
-    n_neighbors_LE=125,
-    procruste_method="best_SI",
-):
-    if procruste_method == "best_SI":
-        # first embed the dFC features of each subject into a lower dimensional space using LE separately
-        embed_dict = {}
-        for subject in train_subjects:
-            # assert the samples of the same subject are contiguous
-            assert np.all(
-                np.diff(np.where(subj_label_train == subject)[0]) == 1
-            ), f"Indices of {subject} are not consecutive"
-            X_subj = X_train[subj_label_train == subject, :]
-            y_subj = y_train[subj_label_train == subject]
-            X_subj_embed = LE_transform(
-                X=X_subj,
-                n_components=n_components,
-                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
-                distance_metric="correlation",
-            )
-            SI = silhouette_score(X_subj_embed, y_subj)
-            embed_dict[subject] = {"X_subj_embed": X_subj_embed, "SI": SI}
-
-        # find the best transformation based on the SI score
-        best_SI = -1
-        best_subject = None
-        for subject in embed_dict:
-            if embed_dict[subject]["SI"] > best_SI:
-                best_SI = embed_dict[subject]["SI"]
-                best_subject = subject
-
-        # apply procrustes transformation to align the embeddings of different subjects
-        # use the embeddings of the subject with the highest SI score as the reference
-        X_train_embed = None
-        for subject in train_subjects:
-            X_subj_embed = embed_dict[subject]["X_subj_embed"]
-            # procrustes transformation
-            if subject == best_subject:
-                X_subj_embed_transformed = X_subj_embed
-            else:
-                # for the procrustes transformation, the number of samples should be the same
-                X_best_subj_embed = precheck_for_procruste(
-                    embed_dict[best_subject]["X_subj_embed"], X_subj_embed
-                )
-                _, X_subj_embed_transformed, _ = procrustes(
-                    X_best_subj_embed, X_subj_embed
-                )
-            if X_train_embed is None:
-                X_train_embed = X_subj_embed_transformed
-            else:
-                X_train_embed = np.concatenate(
-                    (X_train_embed, X_subj_embed_transformed), axis=0
-                )
-
-        # apply the same transformation to the test set
-        X_test_embed = None
-        for subject in test_subjects:
-            # assert the samples of the same subject are contiguous
-            assert np.all(
-                np.diff(np.where(subj_label_test == subject)[0]) == 1
-            ), f"Indices of {subject} are not consecutive"
-            X_subj = X_test[subj_label_test == subject, :]
-            X_subj_embed = LE_transform(
-                X=X_subj,
-                n_components=n_components,
-                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
-                distance_metric="correlation",
-            )
-            # procrustes transformation
-            # for the procrustes transformation, the number of samples should be the same
-            X_best_subj_embed = precheck_for_procruste(
-                embed_dict[best_subject]["X_subj_embed"], X_subj_embed
-            )
-            _, X_subj_embed_transformed, _ = procrustes(X_best_subj_embed, X_subj_embed)
-            if X_test_embed is None:
-                X_test_embed = X_subj_embed_transformed
-            else:
-                X_test_embed = np.concatenate(
-                    (X_test_embed, X_subj_embed_transformed), axis=0
-                )
-
-    elif procruste_method == "generalized":
-        # in this method we use generalized procrustes analysis to align the embeddings of different subjects
-        # first embed the dFC features of each subject into a lower dimensional space using LE separately
-        embed_dict = {}
-        for subject in train_subjects:
-            # assert the samples of the same subject are contiguous
-            assert np.all(
-                np.diff(np.where(subj_label_train == subject)[0]) == 1
-            ), f"Indices of {subject} are not consecutive"
-            X_subj = X_train[subj_label_train == subject, :]
-            X_subj_embed = LE_transform(
-                X=X_subj,
-                n_components=n_components,
-                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
-                distance_metric="correlation",
-            )
-            embed_dict[subject] = X_subj_embed
-
-        # then find the max number of samples among all subjects
-        max_samples = 0
-        for subject in train_subjects:
-            if embed_dict[subject].shape[0] > max_samples:
-                max_samples = embed_dict[subject].shape[0]
-
-        # find the mean embedding of all subjects to use as the reference for procrustes transformation
-        X_train_list = []
-        for subject in train_subjects:
-            X_subj_embed = embed_dict[subject]
-            # add zero rows to the embedding of the subject with less samples
-            if X_subj_embed.shape[0] < max_samples:
-                X_subj_embed_new = np.concatenate(
-                    (
-                        X_subj_embed,
-                        np.zeros(
-                            (
-                                max_samples - X_subj_embed.shape[0],
-                                X_subj_embed.shape[1],
-                            )
-                        ),
-                    ),
-                    axis=0,
-                )
-            else:
-                X_subj_embed_new = X_subj_embed
-            X_train_list.append(X_subj_embed_new)
-        mean_X_train = generalized_procrustes(X_train_list)
-
-        X_train_embed = None
-        for subject in train_subjects:
-            X_subj_embed = embed_dict[subject]
-            mean_X_train_new_size = precheck_for_procruste(mean_X_train, X_subj_embed)
-            _, X_subj_embed_transformed, _ = procrustes(
-                mean_X_train_new_size, X_subj_embed
-            )
-            if X_train_embed is None:
-                X_train_embed = X_subj_embed_transformed
-            else:
-                X_train_embed = np.concatenate(
-                    (X_train_embed, X_subj_embed_transformed), axis=0
-                )
-
-        X_test_embed = None
-        for subject in test_subjects:
-            X_subj = X_test[subj_label_test == subject, :]
-            X_subj_embed = LE_transform(
-                X=X_subj,
-                n_components=n_components,
-                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
-                distance_metric="correlation",
-            )
-            mean_X_train_new_size = precheck_for_procruste(mean_X_train, X_subj_embed)
-            _, X_subj_embed_transformed, _ = procrustes(
-                mean_X_train_new_size, X_subj_embed
-            )
-            if X_test_embed is None:
-                X_test_embed = X_subj_embed_transformed
-            else:
-                X_test_embed = np.concatenate(
-                    (X_test_embed, X_subj_embed_transformed), axis=0
-                )
-
-    return X_train_embed, X_test_embed
-
-
-def embed_dFC_features(
-    train_subjects,
-    test_subjects,
-    X_train,
-    X_test,
-    y_train,
-    y_test,
-    subj_label_train,
-    subj_label_test,
-    embedding="PCA",
-    n_components=30,
-    n_neighbors_LE=125,
-    LE_embedding_method="concat+embed",
-):
-    """
-    Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous.
-
-    for LE, first the LE is applied on each subj separately and then the procrustes transformation is applied to align the embeddings of different subjects.
-    All the subjects are transformed into the space of the subject with the highest silhouette score.
-
-    LE_embedding_method: "concat+embed" or "embed+procrustes"
-    """
-    if embedding == "PCA":
-        pca = PCA(n_components=n_components, svd_solver="full", whiten=False)
-        pca.fit(X_train)
-        X_train_embed = pca.transform(X_train)
-        if X_test is not None:
-            X_test_embed = pca.transform(X_test)
-        else:
-            X_test_embed = None
-    elif embedding == "LE":
-        if LE_embedding_method == "embed+procrustes":
-            X_train_embed, X_test_embed = LE_embed_procustes(
-                X_train=X_train,
-                X_test=X_test,
-                y_train=y_train,
-                y_test=y_test,
-                subj_label_train=subj_label_train,
-                subj_label_test=subj_label_test,
-                train_subjects=train_subjects,
-                test_subjects=test_subjects,
-                n_components=n_components,
-                n_neighbors_LE=n_neighbors_LE,
-                procruste_method="generalized",
-            )
-        elif LE_embedding_method == "concat+embed":
-            # since SpectralEmbedding does not have transform method, we need to fit the LE on the whole data
-            if X_test is not None:
-                X_concat = np.concatenate((X_train, X_test), axis=0)
-            else:
-                X_concat = X_train
-            X_concat_embed = LE_transform(
-                X=X_concat,
-                n_components=n_components,
-                n_neighbors=min(n_neighbors_LE, X_concat.shape[0]),
-                distance_metric="correlation",
-            )
-            X_train_embed = X_concat_embed[: X_train.shape[0], :]
-            if X_test is not None:
-                X_test_embed = X_concat_embed[X_train.shape[0] :, :]
-            else:
-                X_test_embed = None
-
-    return X_train_embed, X_test_embed
-
-
-def dFC_feature_extraction(
-    task,
-    train_subjects,
-    test_subjects,
-    dFC_id,
-    roi_root,
-    dFC_root,
-    run=None,
-    session=None,
-    dynamic_pred="no",
-    normalize_dFC=True,
-):
-    """
-    Extract features and target for task presence classification
-    for all subjects.
-    if run is specified, dFC results for that run will be used.
-    """
-    dFC_measure_name = None
-    X_train = None
-    y_train = None
-    subj_label_train = list()
-    for subj in train_subjects:
-
-        dFC = load_dFC(
-            dFC_root=dFC_root,
-            subj=subj,
-            task=task,
-            dFC_id=dFC_id,
-            run=run,
-            session=session,
-        )
-        task_data = load_task_data(
-            roi_root=roi_root, subj=subj, task=task, run=run, session=session
-        )
-
-        X_subj, y_subj = dFC_feature_extraction_subj_lvl(
-            dFC=dFC,
-            task_data=task_data,
-            dynamic_pred=dynamic_pred,
-            normalize_dFC=normalize_dFC,
-        )
-
-        subj_label_train.extend([subj for i in range(X_subj.shape[0])])
-        if X_train is None and y_train is None:
-            X_train = X_subj
-            y_train = y_subj
-        else:
-            X_train = np.concatenate((X_train, X_subj), axis=0)
-            y_train = np.concatenate((y_train, y_subj), axis=0)
-
-        if dFC_measure_name is None:
-            dFC_measure_name = dFC.measure.measure_name
-        else:
-            assert (
-                dFC_measure_name == dFC.measure.measure_name
-            ), "dFC measure is not consistent."
-
-    X_test = None
-    y_test = None
-    subj_label_test = list()
-    for subj in test_subjects:
-        dFC = load_dFC(
-            dFC_root=dFC_root,
-            subj=subj,
-            task=task,
-            dFC_id=dFC_id,
-            run=run,
-            session=session,
-        )
-        task_data = load_task_data(
-            roi_root=roi_root, subj=subj, task=task, run=run, session=session
-        )
-
-        X_subj, y_subj = dFC_feature_extraction_subj_lvl(
-            dFC=dFC,
-            task_data=task_data,
-            dynamic_pred=dynamic_pred,
-            normalize_dFC=normalize_dFC,
-        )
-
-        subj_label_test.extend([subj for i in range(X_subj.shape[0])])
-        if X_test is None and y_test is None:
-            X_test = X_subj
-            y_test = y_subj
-        else:
-            X_test = np.concatenate((X_test, X_subj), axis=0)
-            y_test = np.concatenate((y_test, y_subj), axis=0)
-
-        if dFC_measure_name is None:
-            dFC_measure_name = dFC.measure.measure_name
-        else:
-            assert (
-                dFC_measure_name == dFC.measure.measure_name
-            ), "dFC measure is not consistent."
-
-    # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
-    subj_label_train = np.array(subj_label_train)
-    subj_label_test = np.array(subj_label_test)
-
-    return (
-        X_train,
-        X_test,
-        y_train,
-        y_test,
-        subj_label_train,
-        subj_label_test,
-        dFC_measure_name,
-    )
-
-
-def logistic_regression_classify(X_train, y_train, X_test, y_test):
-    """
-    Logistic regression classification
-    """
-    # create a pipeline with a logistic regression model to find the best C
-    logistic_reg = make_pipeline(
-        StandardScaler(), LogisticRegression(penalty="l1", solver="saga")
-    )
-    # create a dictionary of all values we want to test for C
-    param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
-    # use gridsearch to test all values for C
-    lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=5)
-    # fit model to data
-    lr_gscv.fit(X_train, y_train)
-
-    C = lr_gscv.best_params_["logisticregression__C"]
-
-    log_reg = make_pipeline(
-        StandardScaler(),
-        LogisticRegression(penalty="l1", C=C, solver="saga"),
-    ).fit(X_train, y_train)
-
-    RESULT = {
-        "log_reg_model": log_reg,
-        "log_reg_C": C,
-        "log_reg_train_score": log_reg.score(X_train, y_train),
-        "log_reg_test_score": log_reg.score(X_test, y_test),
-    }
-
-    return RESULT
-
-
-def KNN_classify(X_train, y_train, X_test, y_test):
-    """
-    KNN classification
-    """
-    # create a pipeline with a knn model to find the best n_neighbors
-    knn = make_pipeline(
-        StandardScaler(),
-        KNeighborsClassifier(),
-    )
-    # create a dictionary of all values we want to test for n_neighbors
-    param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)}
-    # use gridsearch to test all values for n_neighbors
-    knn_gscv = GridSearchCV(knn, param_grid, cv=5)
-    # fit model to data
-    knn_gscv.fit(X_train, y_train)
-
-    n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"]
-
-    neigh = make_pipeline(
-        StandardScaler(),
-        KNeighborsClassifier(n_neighbors=n_neighbors),
-    ).fit(X_train, y_train)
-
-    RESULT = {
-        "KNN_cv_results": knn_gscv.cv_results_,
-        "KNN_model": neigh,
-        "KNN_train_score": neigh.score(X_train, y_train),
-        "KNN_test_score": neigh.score(X_test, y_test),
-    }
-
-    return RESULT
-
-
-def random_forest_classify(X_train, y_train, X_test, y_test):
-    """
-    Random Forest classification
-    """
-    # create a pipeline with a random forest model to find the best n_estimators
-    rf = make_pipeline(
-        StandardScaler(),
-        RandomForestClassifier(),
-    )
-    # create a dictionary of all values we want to test for n_estimators
-    param_grid = {
-        "randomforestclassifier__n_estimators": [10, 50, 100, 200],
-        "randomforestclassifier__max_depth": [None, 5, 10, 20, 30],
-    }
-    # use gridsearch to test all values for n_estimators
-    rf_gscv = GridSearchCV(rf, param_grid, cv=5)
-    # fit model to data
-    rf_gscv.fit(X_train, y_train)
-
-    n_estimators = rf_gscv.best_params_["randomforestclassifier__n_estimators"]
-    max_depth = rf_gscv.best_params_["randomforestclassifier__max_depth"]
-
-    rf = make_pipeline(
-        StandardScaler(),
-        RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth),
-    ).fit(X_train, y_train)
-
-    RESULT = {
-        "RF_cv_results": rf_gscv.cv_results_,
-        "RF_model": rf,
-        "RF_train_score": rf.score(X_train, y_train),
-        "RF_test_score": rf.score(X_test, y_test),
-    }
-
-    return RESULT
-
-
-def gradient_boosting_classify(X_train, y_train, X_test, y_test):
-    """
-    Gradient Boosting classification
-    """
-    # create a pipeline with a gradient boosting model to find the best n_estimators
-    gb = make_pipeline(
-        StandardScaler(),
-        GradientBoostingClassifier(),
-    )
-    # create a dictionary of all values we want to test for n_estimators
-    param_grid = {
-        "gradientboostingclassifier__n_estimators": [10, 50, 100, 200],
-        "gradientboostingclassifier__learning_rate": [0.01, 0.1, 0.2],
-        "gradientboostingclassifier__max_depth": [3, 5, 10],
-    }
-    # use gridsearch to test all values for n_estimators
-    gb_gscv = GridSearchCV(gb, param_grid, cv=5)
-    # fit model to data
-    gb_gscv.fit(X_train, y_train)
-
-    n_estimators = gb_gscv.best_params_["gradientboostingclassifier__n_estimators"]
-    learning_rate = gb_gscv.best_params_["gradientboostingclassifier__learning_rate"]
-    max_depth = gb_gscv.best_params_["gradientboostingclassifier__max_depth"]
-
-    gb = make_pipeline(
-        StandardScaler(),
-        GradientBoostingClassifier(
-            n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate
-        ),
-    ).fit(X_train, y_train)
-
-    RESULT = {
-        "GB_cv_results": gb_gscv.cv_results_,
-        "GB_model": gb,
-        "GB_train_score": gb.score(X_train, y_train),
-        "GB_test_score": gb.score(X_test, y_test),
-    }
-
-    return RESULT
-
-
-def task_presence_classification(
-    task,
-    dFC_id,
-    roi_root,
-    dFC_root,
-    run=None,
-    session=None,
-    dynamic_pred="no",
-    normalize_dFC=True,
-    train_test_ratio=0.8,
-):
-    """
-    perform task presence classification using logistic regression, KNN, Random Forest, Gradient Boosting
-    for a given task and dFC method and run.
-    """
-    if run is None:
-        print(f"=============== {task} ===============")
-    else:
-        print(f"=============== {task} {run} ===============")
-
-    if task == "task-restingstate":
-        return
-
-    SUBJECTS = find_available_subjects(
-        dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id
-    )
-
-    # randomly select train_test_ratio of the subjects for training
-    # and rest for testing using numpy.random.choice
-    train_subjects = np.random.choice(
-        SUBJECTS, int(train_test_ratio * len(SUBJECTS)), replace=False
-    )
-    test_subjects = np.setdiff1d(SUBJECTS, train_subjects)
-    print(
-        f"Number of train subjects: {len(train_subjects)} and test subjects: {len(test_subjects)}"
-    )
-
-    X_train, X_test, y_train, y_test, subj_label_train, subj_label_test, measure_name = (
-        dFC_feature_extraction(
-            task=task,
-            train_subjects=train_subjects,
-            test_subjects=test_subjects,
-            dFC_id=dFC_id,
-            roi_root=roi_root,
-            dFC_root=dFC_root,
-            run=run,
-            session=session,
-            dynamic_pred=dynamic_pred,
-            normalize_dFC=normalize_dFC,
-        )
-    )
-
-    # embed dFC features
-    X_train, X_test = embed_dFC_features(
-        train_subjects=train_subjects,
-        test_subjects=test_subjects,
-        X_train=X_train,
-        X_test=X_test,
-        y_train=y_train,
-        y_test=y_test,
-        subj_label_train=subj_label_train,
-        subj_label_test=subj_label_test,
-        embedding="LE",
-        n_components=30,
-        n_neighbors_LE=125,
-        LE_embedding_method="embed+procrustes",
-    )
-
-    # task presence classification
-
-    print("task presence classification ...")
-
-    # logistic regression
-    log_reg_RESULT = logistic_regression_classify(X_train, y_train, X_test, y_test)
-
-    # KNN
-    KNN_RESULT = KNN_classify(X_train, y_train, X_test, y_test)
-
-    # # Random Forest
-    # RF_RESULT = random_forest_classify(
-    #     X_train, y_train, X_test, y_test
-    # )
-
-    # # Gradient Boosting
-    # GBT_RESULT = gradient_boosting_classify(
-    #     X_train, y_train, X_test, y_test
-    # )
-
-    ML_RESULT = {}
-    for key in log_reg_RESULT:
-        ML_RESULT[key] = log_reg_RESULT[key]
-    for key in KNN_RESULT:
-        ML_RESULT[key] = KNN_RESULT[key]
-    # for key in RF_RESULT:
-    #     ML_RESULT[key] = RF_RESULT[key]
-    # for key in GBT_RESULT:
-    #     ML_RESULT[key] = GBT_RESULT[key]
-
-    # measure pred score on each subj
-
-    ML_scores = {
-        "subj_id": list(),
-        "group": list(),
-        "task": list(),
-        "run": list(),
-        "dFC method": list(),
-        "Logistic regression accuracy": list(),
-        "KNN accuracy": list(),
-        # "Random Forest accuracy": list(),
-        # "Gradient Boosting accuracy": list(),
-    }
-    log_reg = log_reg_RESULT["log_reg_model"]
-    KNN = KNN_RESULT["KNN_model"]
-    # RF = RF_RESULT["RF_model"]
-    # GBT = GBT_RESULT["GB_model"]
-
-    for subj in SUBJECTS:
-        ML_scores["subj_id"].append(subj)
-        if subj in train_subjects:
-            ML_scores["group"].append("train")
-            features = X_train[subj_label_train == subj, :]
-            target = y_train[subj_label_train == subj]
-        elif subj in test_subjects:
-            ML_scores["group"].append("test")
-            features = X_test[subj_label_test == subj, :]
-            target = y_test[subj_label_test == subj]
-
-        pred_lr = log_reg.predict(features)
-        pred_KNN = KNN.predict(features)
-        # pred_RF = RF.predict(features)
-        # pred_GBT = GBT.predict(features)
-
-        ML_scores["Logistic regression accuracy"].append(
-            balanced_accuracy_score(target, pred_lr)
-        )
-        ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN))
-        # ML_scores["Random Forest accuracy"].append(
-        #     balanced_accuracy_score(target, pred_RF)
-        # )
-        # ML_scores["Gradient Boosting accuracy"].append(
-        #     balanced_accuracy_score(target, pred_GBT)
-        # )
-
-        ML_scores["task"].append(task)
-        ML_scores["run"].append(run)
-        ML_scores["dFC method"].append(measure_name)
-
-    return ML_RESULT, ML_scores
-
-
-def task_presence_clustering(
-    task,
-    dFC_id,
-    roi_root,
-    dFC_root,
-    run=None,
-    session=None,
-    normalize_dFC=True,
-):
-    if run is None:
-        print(f"=============== {task} ===============")
-    else:
-        print(f"=============== {task} {run} ===============")
-
-    if task == "task-restingstate":
-        return
-
-    SUBJECTS = find_available_subjects(
-        dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id
-    )
-
-    print(f"Number of subjects: {len(SUBJECTS)}")
-
-    X, _, y, _, subj_label, _, measure_name = dFC_feature_extraction(
-        task=task,
-        train_subjects=SUBJECTS,
-        test_subjects=[],
-        dFC_id=dFC_id,
-        roi_root=roi_root,
-        dFC_root=dFC_root,
-        run=run,
-        session=session,
-        dynamic_pred="no",
-        normalize_dFC=normalize_dFC,
-    )
-
-    # embed dFC features
-    X, _ = embed_dFC_features(
-        train_subjects=SUBJECTS,
-        test_subjects=[],
-        X_train=X,
-        X_test=None,
-        y_train=y,
-        y_test=None,
-        subj_label_train=subj_label,
-        subj_label_test=None,
-        embedding="LE",
-        n_components=30,
-        n_neighbors_LE=125,
-        LE_embedding_method="embed+procrustes",
-    )
-
-    # clustering
-    # apply kmeans clustering to dFC features
-
-    n_clusters = 2  # corresponding to task and rest
-
-    scaler = StandardScaler()
-    X_normalized = scaler.fit_transform(X)
-    kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
-    labels_pred = kmeans.fit_predict(X_normalized)
-
-    # ARI score
-    print(f"ARI score: {adjusted_rand_score(y, labels_pred)}")
-
-    # # visualize clustering centroids
-    # centroids = kmeans.cluster_centers_
-    # centroids = pca.inverse_transform(centroids)
-    # centroids = scaler.inverse_transform(centroids)
-    # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
-    # centroids_mat = dFC_vec2mat(centroids, n_regions)
-
-    clustering_RESULTS = {
-        "StandardScaler": scaler,
-        "kmeans": kmeans,
-        "ARI": adjusted_rand_score(y, labels_pred),
-        # "centroids": centroids_mat,
-    }
-
-    clustering_scores = {
-        "subj_id": list(),
-        "task": list(),
-        "run": list(),
-        "dFC method": list(),
-        "Kmeans ARI": list(),
-        "SI": list(),
-    }
-    for subj in SUBJECTS:
-        clustering_scores["subj_id"].append(subj)
-        features = X[subj_label == subj, :]
-        target = y[subj_label == subj]
-
-        features_normalized = scaler.transform(features)
-        pred_kmeans = kmeans.predict(features_normalized)
-
-        clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_kmeans))
-
-        # silhouette score in terms of separability of original labels, not the clustering labels
-        clustering_scores["SI"].append(silhouette_score(features, target))
-
-        clustering_scores["task"].append(task)
-        clustering_scores["run"].append(run)
-        clustering_scores["dFC method"].append(measure_name)
-
-    return clustering_RESULTS, clustering_scores
-
-
 def run_classification(
     dFC_id,
     TASKS,
@@ -1204,7 +131,7 @@ def run_clustering(
         np.save(f"{folder}/clustering_scores_{dFC_id}.npy", clustering_scores)
 
 
-def task_paradigm_clustering(
+def run_task_paradigm_clustering(
     dFC_id,
     TASKS,
     RUNS,
@@ -1398,6 +325,7 @@ def task_paradigm_clustering(
         RUNS=RUNS,
         SESSIONS=SESSIONS,
         roi_root=roi_root,
+        dFC_root=dFC_root,
         output_root=ML_root,
     )
     print("Task features extraction finished.")
@@ -1442,7 +370,7 @@ def task_paradigm_clustering(
 
     print(f"Task paradigm clustering started for dFC ID {dFC_id} ...")
     try:
-        task_paradigm_clustering(
+        run_task_paradigm_clustering(
             dFC_id=dFC_id,
             TASKS=TASKS,
             RUNS=RUNS,
diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 21bc05b..36b1527 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -353,17 +353,17 @@ def calculate_subj_lvl_task_presence_characteristics(
         binary=True,
         binarizing_method="mean",
     )
-    relative_task_on = task_utils.relative_task_on(task_presence)
+    relative_task_on = task_utils.calc_relative_task_on(task_presence)
     # task duration
-    avg_task_duration, var_task_duration = task_utils.task_duration(
+    avg_task_duration, var_task_duration = task_utils.calc_task_duration(
         task_presence, task_data["TR_mri"]
     )
     # rest duration
-    avg_rest_duration, var_rest_duration = task_utils.rest_duration(
+    avg_rest_duration, var_rest_duration = task_utils.calc_rest_duration(
         task_presence, task_data["TR_mri"]
     )
     # freq of transitions
-    num_of_transitions, relative_transition_freq = task_utils.transition_freq(
+    num_of_transitions, relative_transition_freq = task_utils.calc_transition_freq(
         task_presence
     )
 

From 45e3fd5d82fe84ecff69e329f502e69f7d89211c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 8 Aug 2024 12:19:21 -0400
Subject: [PATCH 105/401] minor change

---
 pydfc/ml_utils.py | 111 +++++++++++++++++++++++++++++++++++++++++++
 task_dFC/ML.py    | 118 ++++------------------------------------------
 2 files changed, 120 insertions(+), 109 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 275753b..ad3ad79 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1113,3 +1113,114 @@ def task_presence_clustering(
         clustering_scores["dFC method"].append(measure_name)
 
     return clustering_RESULTS, clustering_scores
+
+
+def task_paradigm_clustering(
+    dFC_id,
+    TASKS,
+    RUNS,
+    session,
+    roi_root,
+    dFC_root,
+    normalize_dFC=True,
+):
+    """
+    Cluster the dFC features of all TASKS and compare the clusters to the task labels.
+
+    Features of every run in RUNS[task] are pooled over the subjects common to all
+    TASKS, labeled with the index of the task in TASKS, embedded with LE and
+    clustered with kmeans (n_clusters=len(TASKS)). Returns a dict with the measure
+    name, fitted scaler and kmeans, ARI, SI (w.r.t. the task labels) and TASKS.
+    """
+    # find SUBJECTS common to all tasks
+    for task_id, task in enumerate(TASKS):
+        if task_id == 0:
+            SUBJECTS = find_available_subjects(
+                dFC_root=dFC_root, task=task, dFC_id=dFC_id
+            )
+        else:
+            SUBJECTS = np.intersect1d(
+                SUBJECTS,
+                find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id),
+            )
+    print(f"Number of subjects: {len(SUBJECTS)}")
+
+    X = None
+    y = None
+    subj_label = None
+    measure_name = None
+    for task_id, task in enumerate(TASKS):
+        for run in RUNS[task]:
+            X_new, _, _, _, subj_label_new, _, measure_name_new = dFC_feature_extraction(
+                task=task,
+                train_subjects=SUBJECTS,
+                test_subjects=[],
+                dFC_id=dFC_id,
+                roi_root=roi_root,
+                dFC_root=dFC_root,
+                run=run,
+                session=session,
+                dynamic_pred="no",
+                normalize_dFC=normalize_dFC,
+            )
+
+            if measure_name is not None:
+                assert measure_name == measure_name_new, "dFC measure is not consistent."
+            else:
+                measure_name = measure_name_new
+
+            y_new = np.ones(X_new.shape[0]) * task_id
+            if X is None and y is None:
+                X = X_new
+                y = y_new
+                subj_label = subj_label_new
+            else:
+                X = np.concatenate((X, X_new), axis=0)
+                y = np.concatenate((y, y_new), axis=0)
+                subj_label = np.concatenate((subj_label, subj_label_new), axis=0)
+
+    assert X.shape[0] == y.shape[0], "Number of samples do not match."
+    assert X.shape[0] == subj_label.shape[0], "Number of samples do not match."
+
+    # rearrange the order of the samples so that the samples of the same subject are together
+    # the sort must be stable: the default argsort kind may reorder the samples within a
+    # subject, whereas a stable sort keeps them in the task/run order of the extraction
+    idx = np.argsort(subj_label, kind="stable")
+    X = X[idx, :]
+    y = y[idx]
+    subj_label = subj_label[idx]
+
+    # embed dFC features
+    X_embed, _ = embed_dFC_features(
+        train_subjects=SUBJECTS,
+        test_subjects=[],
+        X_train=X,
+        X_test=None,
+        y_train=y,
+        y_test=None,
+        subj_label_train=subj_label,
+        subj_label_test=None,
+        embedding="LE",
+        n_components=30,
+        n_neighbors_LE=125,
+        LE_embedding_method="embed+procrustes",
+    )
+
+    # clustering: apply kmeans to the embedded dFC features, one cluster per task
+    n_clusters = len(TASKS)  # corresponding to task paradigms
+
+    scaler = StandardScaler()
+    X_normalized = scaler.fit_transform(X_embed)
+    kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
+    labels_pred = kmeans.fit_predict(X_normalized)
+
+    task_paradigm_clstr_RESULTS = {
+        "dFC_method": measure_name,
+        "StandardScaler": scaler,
+        "kmeans": kmeans,
+        "ARI": adjusted_rand_score(y, labels_pred),
+        "SI": silhouette_score(X_normalized, y),
+        "task_paradigms": TASKS,
+    }
+
+    return task_paradigm_clstr_RESULTS
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 1fb7bd1..d44e449 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -4,15 +4,10 @@
 import traceback
 
 import numpy as np
-from sklearn.cluster import KMeans
-from sklearn.metrics import adjusted_rand_score, silhouette_score
-from sklearn.preprocessing import StandardScaler
 
 from pydfc.ml_utils import (
-    dFC_feature_extraction,
-    embed_dFC_features,
     extract_task_features,
-    find_available_subjects,
+    task_paradigm_clustering,
     task_presence_classification,
     task_presence_clustering,
 )
@@ -142,112 +137,17 @@ def run_task_paradigm_clustering(
     normalize_dFC=True,
 ):
     for session in SESSIONS:
-        # find SUBJECTS common to all tasks
-        for task_id, task in enumerate(TASKS):
-            if task_id == 0:
-                SUBJECTS = find_available_subjects(
-                    dFC_root=dFC_root, task=task, dFC_id=dFC_id
-                )
-            else:
-                SUBJECTS = np.intersect1d(
-                    SUBJECTS,
-                    find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id),
-                )
-        print(f"Number of subjects: {len(SUBJECTS)}")
-
-        X = None
-        y = None
-        subj_label = None
-        measure_name = None
-        for task_id, task in enumerate(TASKS):
-            for run in RUNS[task]:
-                X_new, _, _, _, subj_label_new, _, measure_name_new = (
-                    dFC_feature_extraction(
-                        task=task,
-                        train_subjects=SUBJECTS,
-                        test_subjects=[],
-                        dFC_id=dFC_id,
-                        roi_root=roi_root,
-                        dFC_root=dFC_root,
-                        run=run,
-                        session=session,
-                        dynamic_pred="no",
-                        normalize_dFC=normalize_dFC,
-                    )
-                )
-
-                if measure_name is not None:
-                    assert (
-                        measure_name == measure_name_new
-                    ), "dFC measure is not consistent."
-                else:
-                    measure_name = measure_name_new
 
-                y_new = np.ones(X_new.shape[0]) * task_id
-                if X is None and y is None:
-                    X = X_new
-                    y = y_new
-                    subj_label = subj_label_new
-                else:
-                    X = np.concatenate((X, X_new), axis=0)
-                    y = np.concatenate((y, y_new), axis=0)
-                    subj_label = np.concatenate((subj_label, subj_label_new), axis=0)
-
-        assert X.shape[0] == y.shape[0], "Number of samples do not match."
-        assert X.shape[0] == subj_label.shape[0], "Number of samples do not match."
-
-        # rearrange the order of the samples so that the samples of the same subject are together
-        idx = np.argsort(subj_label)
-        X = X[idx, :]
-        y = y[idx]
-        subj_label = subj_label[idx]
-
-        # embed dFC features
-        X, _ = embed_dFC_features(
-            train_subjects=SUBJECTS,
-            test_subjects=[],
-            X_train=X,
-            X_test=None,
-            y_train=y,
-            y_test=None,
-            subj_label_train=subj_label,
-            subj_label_test=None,
-            embedding="LE",
-            n_components=30,
-            n_neighbors_LE=125,
-            LE_embedding_method="embed+procrustes",
+        task_paradigm_clstr_RESULTS = task_paradigm_clustering(
+            dFC_id=dFC_id,
+            TASKS=TASKS,
+            RUNS=RUNS,
+            session=session,
+            roi_root=roi_root,
+            dFC_root=dFC_root,
+            normalize_dFC=normalize_dFC,
         )
 
-        # clustering
-        # apply kmeans clustering to dFC features
-
-        n_clusters = len(TASKS)  # corresponding to task paradigms
-
-        scaler = StandardScaler()
-        X_normalized = scaler.fit_transform(X)
-        kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
-        labels_pred = kmeans.fit_predict(X_normalized)
-
-        # ARI score
-        print(f"ARI score: {adjusted_rand_score(y, labels_pred)}")
-
-        # # visualize clustering centroids
-        # centroids = kmeans.cluster_centers_
-        # centroids = pca.inverse_transform(centroids)
-        # centroids = scaler.inverse_transform(centroids)
-        # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
-        # centroids_mat = dFC_vec2mat(centroids, n_regions)
-
-        task_paradigm_clstr_RESULTS = {
-            "dFC_method": measure_name,
-            "StandardScaler": scaler,
-            "kmeans": kmeans,
-            "ARI": adjusted_rand_score(y, labels_pred),
-            "SI": silhouette_score(X_normalized, y),
-            # "centroids": centroids_mat,
-            "task_paradigms": TASKS,
-        }
-
         if session is None:
             folder = f"{output_root}"
         else:

From 34e40e152744dea485b92fe3f91ca1b0f72be52a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 13 Aug 2024 22:37:36 -0400
Subject: [PATCH 106/401] add intrinsic dim estimate

---
 pydfc/ml_utils.py | 162 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 154 insertions(+), 8 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index ad3ad79..72d35d5 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -9,14 +9,15 @@
 
 import numpy as np
 from scipy.spatial import procrustes
+from scipy.stats import zscore
 from sklearn.cluster import KMeans
 from sklearn.decomposition import PCA
 from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.manifold import SpectralEmbedding
 from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score, silhouette_score
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
-from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph
+from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors, kneighbors_graph
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 
@@ -449,6 +450,132 @@ def generalized_procrustes(X_list):
     return mean_X
 
 
+def twonn(X, discard_ratio=0.1):
+    """
+    Calculates intrinsic dimension of the provided data points with the TWO-NN algorithm.
+
+    -----------
+    Parameters:
+
+    X : 2d array-like
+        (n_samples, n_features)
+    discard_ratio : float between 0 and 1
+        Fraction of largest distance ratios (mu = r2 / r1) to discard (heuristic from the paper)
+
+    Returns:
+
+    d : float
+        Intrinsic dimension of the dataset according to TWO-NN.
+    """
+
+    num_samples = X.shape[0]
+    # query up to 30 neighbors, but never more than the other points available
+    NN = NearestNeighbors(n_neighbors=min(30, num_samples - 1))
+    NN.fit(X)
+    distances, _ = NN.kneighbors(return_distance=True)
+
+    mu = np.zeros((num_samples))
+    for i in range(num_samples):
+        # find the two nearest neighbors that have different distances and the distance is not 0
+        r1, r2 = None, None
+        for j in range(distances.shape[1]):
+            if distances[i, j] != 0:
+                if r1 is None:
+                    r1 = distances[i, j]
+                elif distances[i, j] != r1:
+                    r2 = distances[i, j]
+                    break
+        if r1 is not None and r2 is not None:
+            mu[i] = r2 / r1
+        else:
+            mu[i] = np.nan
+
+    # discard NaN values (points without two distinct non-zero neighbor distances)
+    mu = mu[~np.isnan(mu)]
+    num_valid = len(mu)
+    # large ratios will cause the estimation to be biased, discard them
+    mu = np.sort(mu)[: int((1 - discard_ratio) * num_valid)]
+    # empirical CDF; the denominator is the number of valid points before discarding
+    CDF = np.arange(1, 1 + len(mu)) / num_valid
+    # Fit the formula: -log(1 - CDF) = d * log(mu)
+    lr = LinearRegression(fit_intercept=False)
+    lr.fit(np.log(mu).reshape(-1, 1), -np.log(1 - CDF).reshape(-1, 1))
+    d = lr.coef_[0][0]
+
+    return d
+
+
+def SI_ID(X, y, search_range=range(2, 50, 5), n_neighbors_LE=125):
+    """
+    Return the n_components in search_range whose LE embedding of X has the highest silhouette score w.r.t. y.
+    """
+    # all samples are passed to embed_dFC_features as one pseudo-subject named "subj"
+    SI_score = {}
+    for n_components in search_range:
+        X_train_embed, _ = embed_dFC_features(
+            train_subjects=["subj"],
+            test_subjects=[],
+            X_train=X,
+            X_test=None,
+            y_train=y,
+            y_test=None,
+            subj_label_train=np.array(["subj"] * len(y)),
+            subj_label_test=None,
+            embedding="LE",
+            n_components=n_components,
+            n_neighbors_LE=n_neighbors_LE,
+            LE_embedding_method="embed+procrustes",
+        )
+
+        SI_score[n_components] = silhouette_score(X_train_embed, y)
+
+    # pick the n_components with the highest score (the first one tried wins ties)
+    intrinsic_dim = max(SI_score, key=SI_score.get)
+
+    return intrinsic_dim
+
+
+def find_intrinsic_dim(
+    X,
+    y,
+    subj_label,
+    subjects,
+    method="SI",
+    n_neighbors_LE=125,
+    search_range_SI=range(2, 50, 5),
+):
+    """
+    Find the number of components to use for embedding the data using LE.
+    Estimate the intrinsic dimension per subject and average across subjects.
+
+    method: "SI" or "twonn"; any other value raises ValueError
+
+    Returns:
+    intrinsic_dim: number of components to use for embedding
+    """
+    if method not in ("SI", "twonn"):
+        # fail loudly instead of hitting an unbound `intrinsic_dim` below
+        raise ValueError(f'Unknown method "{method}"; expected "SI" or "twonn".')
+
+    # estimate the intrinsic dimension of each subject separately
+    intrinsic_dim_all = list()
+    for subject in subjects:
+        subj_mask = subj_label == subject
+        if method == "SI":
+            intrinsic_dim_all.append(
+                SI_ID(
+                    X[subj_mask, :],
+                    y[subj_mask],
+                    search_range=search_range_SI,
+                    n_neighbors_LE=n_neighbors_LE,
+                )
+            )
+        else:
+            intrinsic_dim_all.append(twonn(X[subj_mask, :], discard_ratio=0.1))
+    # average across subjects, truncated to an integer number of components
+    return int(np.mean(intrinsic_dim_all))
+
+
 def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"):
     """
     Apply Laplacian Eigenmaps (LE) to transform data into a lower dimensional space.
@@ -653,9 +780,9 @@ def embed_dFC_features(
     subj_label_train,
     subj_label_test,
     embedding="PCA",
-    n_components=30,
+    n_components="auto",
     n_neighbors_LE=125,
-    LE_embedding_method="concat+embed",
+    LE_embedding_method="embed+procrustes",
 ):
     """
     Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous.
@@ -666,7 +793,11 @@ def embed_dFC_features(
     LE_embedding_method: "concat+embed" or "embed+procrustes"
     """
     if embedding == "PCA":
-        pca = PCA(n_components=n_components, svd_solver="full", whiten=False)
+        # if n_components is not specified, use 95% of the variance
+        if n_components == "auto":
+            pca = PCA(n_components=0.95, svd_solver="full", whiten=False)
+        else:
+            pca = PCA(n_components=n_components, svd_solver="full", whiten=False)
         pca.fit(X_train)
         X_train_embed = pca.transform(X_train)
         if X_test is not None:
@@ -674,6 +805,18 @@ def embed_dFC_features(
         else:
             X_test_embed = None
     elif embedding == "LE":
+        # if n_components is not specified, find the intrinsic dimension of the data using training set and based on the silhouette score
+        if n_components == "auto":
+            n_components = find_intrinsic_dim(
+                X=X_train,
+                y=y_train,
+                subj_label=subj_label_train,
+                subjects=train_subjects,
+                method="SI",
+                n_neighbors_LE=n_neighbors_LE,
+                search_range_SI=range(2, 50, 5),
+            )
+
         if LE_embedding_method == "embed+procrustes":
             X_train_embed, X_test_embed = LE_embed_procustes(
                 X_train=X_train,
@@ -918,7 +1061,7 @@ def task_presence_classification(
         subj_label_train=subj_label_train,
         subj_label_test=subj_label_test,
         embedding="LE",
-        n_components=30,
+        n_components="auto",
         n_neighbors_LE=125,
         LE_embedding_method="embed+procrustes",
     )
@@ -1055,7 +1198,7 @@ def task_presence_clustering(
         subj_label_train=subj_label,
         subj_label_test=None,
         embedding="LE",
-        n_components=30,
+        n_components="auto",
         n_neighbors_LE=125,
         LE_embedding_method="embed+procrustes",
     )
@@ -1156,6 +1299,9 @@ def task_paradigm_clustering(
                 normalize_dFC=normalize_dFC,
             )
 
+            # normalize the features
+            X_new = zscore(X_new, axis=0)
+
             if measure_name is not None:
                 assert measure_name == measure_name_new, "dFC measure is not consistent."
             else:
@@ -1191,7 +1337,7 @@ def task_paradigm_clustering(
         subj_label_train=subj_label,
         subj_label_test=None,
         embedding="LE",
-        n_components=30,
+        n_components="auto",
         n_neighbors_LE=125,
         LE_embedding_method="embed+procrustes",
     )

From 013d01d0bad64b88e5d48238cda7daa33449b243 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 20 Aug 2024 13:36:18 -0400
Subject: [PATCH 107/401] minor change

---
 pydfc/ml_utils.py           | 30 +++++++++++++++++++++++-------
 task_dFC/generate_report.py | 12 ++++++++++--
 2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 72d35d5..5c17002 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -6,6 +6,7 @@
 @author: Mohammad Torabi
 """
 import os
+import warnings
 
 import numpy as np
 from scipy.spatial import procrustes
@@ -579,10 +580,23 @@ def find_intrinsic_dim(
 def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"):
     """
     Apply Laplacian Eigenmaps (LE) to transform data into a lower dimensional space.
+
+    if n_neighbors >= n_samples, n_neighbors will be changed to the lower limit n_neighbors
     """
+    min_n_neighbors = 70
+
+    if n_neighbors >= X.shape[0]:
+        n_neighbors_to_be_used = min_n_neighbors
+        # raise a warning
+        warnings.warn(
+            "n_neighbors is larger than the number of samples. n_neighbors is set to the minimum value of 70."
+        )
+    else:
+        n_neighbors_to_be_used = n_neighbors
+
     affinity_matrix = kneighbors_graph(
         X,
-        n_neighbors=n_neighbors,
+        n_neighbors=n_neighbors_to_be_used,
         mode="connectivity",
         include_self=False,
         metric=distance_metric,
@@ -590,7 +604,9 @@ def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"):
     affinity_matrix = affinity_matrix.toarray()
     affinity_matrix = np.divide(affinity_matrix + affinity_matrix.T, 2)
     LE = SpectralEmbedding(
-        n_components=n_components, affinity="precomputed", n_neighbors=n_neighbors
+        n_components=n_components,
+        affinity="precomputed",
+        n_neighbors=n_neighbors_to_be_used,
     )
     X_embed = LE.fit_transform(X=affinity_matrix)
     return X_embed
@@ -622,7 +638,7 @@ def LE_embed_procustes(
             X_subj_embed = LE_transform(
                 X=X_subj,
                 n_components=n_components,
-                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                n_neighbors=n_neighbors_LE,
                 distance_metric="correlation",
             )
             SI = silhouette_score(X_subj_embed, y_subj)
@@ -670,7 +686,7 @@ def LE_embed_procustes(
             X_subj_embed = LE_transform(
                 X=X_subj,
                 n_components=n_components,
-                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                n_neighbors=n_neighbors_LE,
                 distance_metric="correlation",
             )
             # procrustes transformation
@@ -699,7 +715,7 @@ def LE_embed_procustes(
             X_subj_embed = LE_transform(
                 X=X_subj,
                 n_components=n_components,
-                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                n_neighbors=n_neighbors_LE,
                 distance_metric="correlation",
             )
             embed_dict[subject] = X_subj_embed
@@ -753,7 +769,7 @@ def LE_embed_procustes(
             X_subj_embed = LE_transform(
                 X=X_subj,
                 n_components=n_components,
-                n_neighbors=min(n_neighbors_LE, X_subj.shape[0]),
+                n_neighbors=n_neighbors_LE,
                 distance_metric="correlation",
             )
             mean_X_train_new_size = precheck_for_procruste(mean_X_train, X_subj_embed)
@@ -840,7 +856,7 @@ def embed_dFC_features(
             X_concat_embed = LE_transform(
                 X=X_concat,
                 n_components=n_components,
-                n_neighbors=min(n_neighbors_LE, X_concat.shape[0]),
+                n_neighbors=n_neighbors_LE,
                 distance_metric="correlation",
             )
             X_train_embed = X_concat_embed[: X_train.shape[0], :]
diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 36b1527..4c99d88 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -507,6 +507,8 @@ def plot_ML_results(
         capsize=0.1,
     )
     g.axhline(0.5, color="r", linestyle="--")
+    # set the y-axis upper limit to 1, but not set the lower limit
+    g.set(ylim=(None, 1))
     if show_title:
         g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"})
 
@@ -603,6 +605,8 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
         capsize=0.1,
     )
     g.axhline(0.0, color="r", linestyle="--")
+    # set the y-axis upper limit to 1, but not set the lower limit
+    g.set(ylim=(None, 1))
     if show_title:
         g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"})
 
@@ -645,7 +649,8 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
         dodge=True,
         capsize=0.1,
     )
-
+    # set the y-axis upper limit to 1, but not set the lower limit
+    g.set(ylim=(None, 1))
     if show_title:
         g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"})
     # save the figure
@@ -741,6 +746,8 @@ def plot_paradigm_clustering_score(
         capsize=0.1,
     )
     g.axhline(0.0, color="r", linestyle="--")
+    # set the y-axis upper limit to 1, but not set the lower limit
+    g.set(ylim=(None, 1))
     if show_title:
         g.set_title(
             "Task Paradigm Clustering Performance",
@@ -776,7 +783,8 @@ def plot_paradigm_clustering_score(
         dodge=True,
         capsize=0.1,
     )
-
+    # set the y-axis upper limit to 1, but not set the lower limit
+    g.set(ylim=(None, 1))
     if show_title:
         g.set_title(
             "Task Paradigm Clustering Performance",

From 1504f86acd00d3c1dfe8493881bb7a6f2342e33a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 29 Aug 2024 21:33:09 -0400
Subject: [PATCH 108/401] change generalized procruste

---
 .flake8           |  1 +
 pydfc/ml_utils.py | 73 +++++++++++++++++++++++++++++------------------
 2 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/.flake8 b/.flake8
index 7f73516..b57c737 100644
--- a/.flake8
+++ b/.flake8
@@ -25,6 +25,7 @@ ignore =
     E731,
     E713,
     E714,
+    E722,
     E741,
     F403,
     F405,
diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 5c17002..de8a37e 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -412,43 +412,61 @@ def generalized_procrustes(X_list):
 
     returns the mean X to be used as the reference for procrustes transformation
     """
-    # initialize Procrustes distance
-    current_distance = 0
+    for iter_num in range(100):
 
-    # initialize a mean X
-    mean_X = np.array(X_list[0])
+        try:
+            # initialize Procrustes distance
+            current_distance = 0
 
-    num_X = len(X_list)
+            num_X = len(X_list)
 
-    # create array for new Xs, add
-    new_Xs = np.zeros(np.array(X_list).shape)
+            # initialize a mean X by randomly selecting
+            # one of the Xs using np.random.choice
+            mean_X = X_list[np.random.choice(num_X)]
 
-    while True:
-        # add the mean X as first element of array
-        new_Xs[0] = mean_X
+            # create array for new Xs, add
+            new_Xs = np.zeros(np.array(X_list).shape)
 
-        # superimpose all shapes to current mean
-        for i in range(1, num_X):
-            _, new_X, _ = procrustes(mean_X, X_list[i])
-            new_Xs[i] = new_X
+            counter = 0
+            flag = False
+            while True:
+                counter += 1
+                if counter > 1e6:
+                    # if the algorithm does not converge, break the cycle
+                    # to avoid infinite loop
+                    flag = True
+                    break
+
+                # add the mean X as first element of array
+                new_Xs[0] = mean_X
+
+                # superimpose all shapes to current mean
+                for i in range(1, num_X):
+                    _, new_X, _ = procrustes(mean_X, X_list[i])
+                    new_Xs[i] = new_X
 
-        # calculate new mean
-        new_mean = np.mean(new_Xs, axis=0)
+                # calculate new mean
+                new_mean = np.mean(new_Xs, axis=0)
 
-        _, _, new_distance = procrustes(new_mean, mean_X)
+                _, _, new_distance = procrustes(new_mean, mean_X)
+
+                # if the distance did not change, break the cycle
+                if np.abs(new_distance - current_distance) < 1e-6:
+                    break
 
-        # if the distance did not change, break the cycle
-        if np.abs(new_distance - current_distance) < 1e-6:
-            break
+                # align the new_mean to old mean
+                _, new_mean, _ = procrustes(mean_X, new_mean)
 
-        # align the new_mean to old mean
-        _, new_mean, _ = procrustes(mean_X, new_mean)
+                # update mean and distance
+                mean_X = new_mean
+                current_distance = new_distance
 
-        # update mean and distance
-        mean_X = new_mean
-        current_distance = new_distance
+            if not flag:
+                return mean_X
+        except:
+            continue
 
-    return mean_X
+    raise ValueError("Generalized Procrustes Analysis did not converge.")
 
 
 def twonn(X, discard_ratio=0.1):
@@ -477,7 +495,8 @@ def twonn(X, discard_ratio=0.1):
 
     mu = np.zeros((num_samples))
     for i in range(num_samples):
-        # find the two nearest neighbors that have different distances and the distance is not 0
+        # find the two nearest neighbors that have
+        # different distances and the distance is not 0
         r1, r2 = None, None
         for j in range(distances.shape[1]):
             if distances[i, j] != 0:

From 55cb8decc52ad02793dcb39df444ad7d91debb1e Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 10 Sep 2024 14:51:01 -0400
Subject: [PATCH 109/401] fix bugs

---
 pydfc/ml_utils.py   | 129 +++++++++++++++++++++-----------------------
 pydfc/task_utils.py |   6 +++
 task_dFC/ML.py      |  75 +++++++++++++++++++++-----
 3 files changed, 127 insertions(+), 83 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index de8a37e..857353f 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -128,81 +128,72 @@ def load_task_data(roi_root, subj, task, run=None, session=None):
 ################################# Feature Extraction Functions ####################################
 
 
-def extract_task_features(TASKS, RUNS, SESSIONS, roi_root, dFC_root, output_root):
+def extract_task_features(TASKS, RUNS, session, roi_root, dFC_root):
     """
     Extract task features from the event data."""
-    for session in SESSIONS:
-        task_features = {
-            "task": list(),
-            "run": list(),
-            "relative_task_on": list(),
-            "avg_task_duration": list(),
-            "var_task_duration": list(),
-            "avg_rest_duration": list(),
-            "var_rest_duration": list(),
-            "num_of_transitions": list(),
-            "relative_transition_freq": list(),
-        }
-        for task_id, task in enumerate(TASKS):
-
-            if task == "task-restingstate":
-                continue
-
-            for run in RUNS[task]:
-
-                SUBJECTS = find_available_subjects(
-                    dFC_root=dFC_root, task=task, run=run, session=session
-                )
+    task_features = {
+        "task": list(),
+        "run": list(),
+        "relative_task_on": list(),
+        "avg_task_duration": list(),
+        "var_task_duration": list(),
+        "avg_rest_duration": list(),
+        "var_rest_duration": list(),
+        "num_of_transitions": list(),
+        "relative_transition_freq": list(),
+    }
+    for task_id, task in enumerate(TASKS):
 
-                for subj in SUBJECTS:
-                    # event data
-                    task_data = load_task_data(
-                        roi_root=roi_root, subj=subj, task=task, run=run, session=session
-                    )
-                    Fs_task = task_data["Fs_task"]
-                    TR_task = 1 / Fs_task
-
-                    task_presence = extract_task_presence(
-                        event_labels=task_data["event_labels"],
-                        TR_task=TR_task,
-                        TR_mri=task_data["TR_mri"],
-                        binary=True,
-                        binarizing_method="mean",
-                    )
+        if task == "task-restingstate":
+            continue
 
-                    relative_task_on = calc_relative_task_on(task_presence)
-                    # task duration
-                    avg_task_duration, var_task_duration = calc_task_duration(
-                        task_presence, task_data["TR_mri"]
-                    )
-                    # rest duration
-                    avg_rest_duration, var_rest_duration = calc_rest_duration(
-                        task_presence, task_data["TR_mri"]
-                    )
-                    # freq of transitions
-                    num_of_transitions, relative_transition_freq = calc_transition_freq(
-                        task_presence
-                    )
+        for run in RUNS[task]:
 
-                    task_features["task"].append(task)
-                    task_features["run"].append(run)
-                    task_features["relative_task_on"].append(relative_task_on)
-                    task_features["avg_task_duration"].append(avg_task_duration)
-                    task_features["var_task_duration"].append(var_task_duration)
-                    task_features["avg_rest_duration"].append(avg_rest_duration)
-                    task_features["var_rest_duration"].append(var_rest_duration)
-                    task_features["num_of_transitions"].append(num_of_transitions)
-                    task_features["relative_transition_freq"].append(
-                        relative_transition_freq
-                    )
+            SUBJECTS = find_available_subjects(
+                dFC_root=dFC_root, task=task, run=run, session=session
+            )
 
-        if session is None:
-            folder = f"{output_root}"
-        else:
-            folder = f"{output_root}/{session}"
-        if not os.path.exists(folder):
-            os.makedirs(folder)
-        np.save(f"{folder}/task_features.npy", task_features)
+            for subj in SUBJECTS:
+                # event data
+                task_data = load_task_data(
+                    roi_root=roi_root, subj=subj, task=task, run=run, session=session
+                )
+                Fs_task = task_data["Fs_task"]
+                TR_task = 1 / Fs_task
+
+                task_presence = extract_task_presence(
+                    event_labels=task_data["event_labels"],
+                    TR_task=TR_task,
+                    TR_mri=task_data["TR_mri"],
+                    binary=True,
+                    binarizing_method="mean",
+                )
+
+                relative_task_on = calc_relative_task_on(task_presence)
+                # task duration
+                avg_task_duration, var_task_duration = calc_task_duration(
+                    task_presence, task_data["TR_mri"]
+                )
+                # rest duration
+                avg_rest_duration, var_rest_duration = calc_rest_duration(
+                    task_presence, task_data["TR_mri"]
+                )
+                # freq of transitions
+                num_of_transitions, relative_transition_freq = calc_transition_freq(
+                    task_presence
+                )
+
+                task_features["task"].append(task)
+                task_features["run"].append(run)
+                task_features["relative_task_on"].append(relative_task_on)
+                task_features["avg_task_duration"].append(avg_task_duration)
+                task_features["var_task_duration"].append(var_task_duration)
+                task_features["avg_rest_duration"].append(avg_rest_duration)
+                task_features["var_rest_duration"].append(var_rest_duration)
+                task_features["num_of_transitions"].append(num_of_transitions)
+                task_features["relative_transition_freq"].append(relative_transition_freq)
+
+    return task_features
 
 
 def dFC_feature_extraction_subj_lvl(
diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 8a11cbf..4fa8f0d 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -336,6 +336,9 @@ def calc_task_duration(task_presence, TR_mri):
             task_durations.append((end - start) * TR_mri)
             start = None
     task_durations = np.array(task_durations)
+    # find mean and variance of task durations with division error handling
+    if len(task_durations) == 0:
+        return 0, 0
     return np.mean(task_durations), np.var(task_durations)
 
 
@@ -358,6 +361,9 @@ def calc_rest_duration(task_presence, TR_mri):
         end = len(task_presence)
         rest_durations.append((end - start) * TR_mri)
     rest_durations = np.array(rest_durations)
+    # find mean and variance of rest durations with division error handling
+    if len(rest_durations) == 0:
+        return 0, 0
     return np.mean(rest_durations), np.var(rest_durations)
 
 
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index d44e449..a792130 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -15,6 +15,39 @@
 #######################################################################################
 
 
+def run_task_features_extraction(
+    TASKS,
+    RUNS,
+    SESSIONS,
+    roi_root,
+    dFC_root,
+    output_root,
+):
+    """
+    Extract task features per session and save them as `task_features.npy`.
+    OS errors are printed, not raised; parallel runs may do this redundantly.
+    """
+    for session in SESSIONS:
+        task_features = extract_task_features(
+            TASKS=TASKS,
+            RUNS=RUNS,
+            session=session,
+            roi_root=roi_root,
+            dFC_root=dFC_root,
+        )
+        folder = f"{output_root}" if session is None else f"{output_root}/{session}"
+        try:
+            # exist_ok avoids the check-then-create race between parallel jobs
+            os.makedirs(folder, exist_ok=True)
+        except OSError as err:
+            print(err)
+        try:
+            if not os.path.exists(f"{folder}/task_features.npy"):
+                np.save(f"{folder}/task_features.npy", task_features)
+        except OSError as err:
+            print(err)
+
+
 def run_classification(
     dFC_id,
     TASKS,
@@ -66,8 +99,11 @@ def run_classification(
             folder = f"{output_root}"
         else:
             folder = f"{output_root}/{session}"
-        if not os.path.exists(folder):
-            os.makedirs(folder)
+        try:
+            if not os.path.exists(folder):
+                os.makedirs(folder)
+        except OSError as err:
+            print(err)
         np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT)
 
         np.save(f"{folder}/ML_scores_classify_{dFC_id}.npy", ML_scores)
@@ -119,8 +155,11 @@ def run_clustering(
             folder = f"{output_root}"
         else:
             folder = f"{output_root}/{session}"
-        if not os.path.exists(folder):
-            os.makedirs(folder)
+        try:
+            if not os.path.exists(folder):
+                os.makedirs(folder)
+        except OSError as err:
+            print(err)
         np.save(f"{folder}/clustering_RESULTS_{dFC_id}.npy", clustering_RESULTS)
 
         np.save(f"{folder}/clustering_scores_{dFC_id}.npy", clustering_scores)
@@ -152,8 +191,11 @@ def run_task_paradigm_clustering(
             folder = f"{output_root}"
         else:
             folder = f"{output_root}/{session}"
-        if not os.path.exists(folder):
-            os.makedirs(folder)
+        try:
+            if not os.path.exists(folder):
+                os.makedirs(folder)
+        except OSError as err:
+            print(err)
 
         np.save(
             f"{folder}/task_paradigm_clstr_RESULTS_{dFC_id}.npy",
@@ -220,14 +262,19 @@ def run_task_paradigm_clustering(
     else:
         ML_root = dataset_info["ML_root"]
 
-    extract_task_features(
-        TASKS=TASKS,
-        RUNS=RUNS,
-        SESSIONS=SESSIONS,
-        roi_root=roi_root,
-        dFC_root=dFC_root,
-        output_root=ML_root,
-    )
+    # The task feature extraction will be executed multiple times in parallel redundantly
+    try:
+        run_task_features_extraction(
+            TASKS=TASKS,
+            RUNS=RUNS,
+            SESSIONS=SESSIONS,
+            roi_root=roi_root,
+            dFC_root=dFC_root,
+            output_root=ML_root,
+        )
+    except Exception as e:
+        print(f"Error in task features extraction: {e}")
+        traceback.print_exc()
     print("Task features extraction finished.")
 
     job_id = int(os.getenv("SGE_TASK_ID"))

From b5be00bd455e3acac2ce88172b6988134baaa5b7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 10 Sep 2024 15:06:59 -0400
Subject: [PATCH 110/401] handle common bold.json

---
 task_dFC/nifti_to_roi_signal.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 3953865..56880e4 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -65,6 +65,26 @@ def run_roi_signal_extraction(
             nifti_file = f"{fmriprep_root}/{subj}/{session}/func/{task_file}"
             task_events_root = f"{main_root}/bids/{subj}/{session}/func"
         info_file = f"{task_events_root}/{task_file.replace(bold_suffix, '_bold.json')}"
+
+        # in some cases the info file is common for all subjects and can be found in f"{main_root}/bids"
+        if not os.path.exists(info_file):
+            ALL_COMMON_FILES = os.listdir(f"{main_root}/bids/")
+            ALL_COMMON_FILES = [
+                file_i
+                for file_i in ALL_COMMON_FILES
+                if (f"{task}_" in file_i) and ("_bold.json" in file_i)
+            ]
+            if len(ALL_COMMON_FILES) == 1:
+                info_file = f"{main_root}/bids/{ALL_COMMON_FILES[0]}"
+        if not os.path.exists(info_file):
+            # if the info file is not found, exclude the subject
+            if run is None:
+                print(f"bold.json info file not found for {subj} {session_str} {task}")
+            else:
+                print(
+                    f"bold.json info file not found for {subj} {session_str} {task} {run}"
+                )
+            return
         ################################# LOAD JSON INFO #########################
         # Opening JSON file as a dictionary
         f = open(info_file)

From 4a6405bd5d1429bbf42bab19781fffac5465e537 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 12 Sep 2024 11:50:51 -0400
Subject: [PATCH 111/401] add slurm run scripts

---
 pydfc/ml_utils.py                             |  1 +
 .../dataset_info.json                         |  0
 .../global_configs.json                       |  0
 .../methods_config.json                       |  0
 .../run_FCS.sh                                |  0
 .../run_ML.sh                                 |  0
 .../run_dFC.sh                                |  0
 .../run_fmriprep.sh                           |  0
 .../run_nifti_to_roi.sh                       |  0
 .../run_report.sh                             |  0
 task_dFC/run_scripts_slurm/dataset_info.json  | 22 ++++++++
 .../run_scripts_slurm/global_configs.json     | 54 +++++++++++++++++++
 .../run_scripts_slurm/methods_config.json     | 35 ++++++++++++
 task_dFC/run_scripts_slurm/run_FCS.sh         | 18 +++++++
 task_dFC/run_scripts_slurm/run_ML.sh          | 16 ++++++
 task_dFC/run_scripts_slurm/run_dFC.sh         | 23 ++++++++
 task_dFC/run_scripts_slurm/run_fmriprep.sh    | 24 +++++++++
 .../run_scripts_slurm/run_nifti_to_roi.sh     | 23 ++++++++
 task_dFC/run_scripts_slurm/run_report.sh      | 18 +++++++
 19 files changed, 234 insertions(+)
 rename task_dFC/{run_scripts => run_scripts_sge}/dataset_info.json (100%)
 rename task_dFC/{run_scripts => run_scripts_sge}/global_configs.json (100%)
 rename task_dFC/{run_scripts => run_scripts_sge}/methods_config.json (100%)
 rename task_dFC/{run_scripts => run_scripts_sge}/run_FCS.sh (100%)
 rename task_dFC/{run_scripts => run_scripts_sge}/run_ML.sh (100%)
 rename task_dFC/{run_scripts => run_scripts_sge}/run_dFC.sh (100%)
 rename task_dFC/{run_scripts => run_scripts_sge}/run_fmriprep.sh (100%)
 rename task_dFC/{run_scripts => run_scripts_sge}/run_nifti_to_roi.sh (100%)
 rename task_dFC/{run_scripts => run_scripts_sge}/run_report.sh (100%)
 create mode 100644 task_dFC/run_scripts_slurm/dataset_info.json
 create mode 100644 task_dFC/run_scripts_slurm/global_configs.json
 create mode 100644 task_dFC/run_scripts_slurm/methods_config.json
 create mode 100644 task_dFC/run_scripts_slurm/run_FCS.sh
 create mode 100644 task_dFC/run_scripts_slurm/run_ML.sh
 create mode 100644 task_dFC/run_scripts_slurm/run_dFC.sh
 create mode 100644 task_dFC/run_scripts_slurm/run_fmriprep.sh
 create mode 100644 task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
 create mode 100644 task_dFC/run_scripts_slurm/run_report.sh

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 857353f..693e881 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -617,6 +617,7 @@ def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"):
         n_components=n_components,
         affinity="precomputed",
         n_neighbors=n_neighbors_to_be_used,
+        # eigen_solver="lobpcg",
     )
     X_embed = LE.fit_transform(X=affinity_matrix)
     return X_embed
diff --git a/task_dFC/run_scripts/dataset_info.json b/task_dFC/run_scripts_sge/dataset_info.json
similarity index 100%
rename from task_dFC/run_scripts/dataset_info.json
rename to task_dFC/run_scripts_sge/dataset_info.json
diff --git a/task_dFC/run_scripts/global_configs.json b/task_dFC/run_scripts_sge/global_configs.json
similarity index 100%
rename from task_dFC/run_scripts/global_configs.json
rename to task_dFC/run_scripts_sge/global_configs.json
diff --git a/task_dFC/run_scripts/methods_config.json b/task_dFC/run_scripts_sge/methods_config.json
similarity index 100%
rename from task_dFC/run_scripts/methods_config.json
rename to task_dFC/run_scripts_sge/methods_config.json
diff --git a/task_dFC/run_scripts/run_FCS.sh b/task_dFC/run_scripts_sge/run_FCS.sh
similarity index 100%
rename from task_dFC/run_scripts/run_FCS.sh
rename to task_dFC/run_scripts_sge/run_FCS.sh
diff --git a/task_dFC/run_scripts/run_ML.sh b/task_dFC/run_scripts_sge/run_ML.sh
similarity index 100%
rename from task_dFC/run_scripts/run_ML.sh
rename to task_dFC/run_scripts_sge/run_ML.sh
diff --git a/task_dFC/run_scripts/run_dFC.sh b/task_dFC/run_scripts_sge/run_dFC.sh
similarity index 100%
rename from task_dFC/run_scripts/run_dFC.sh
rename to task_dFC/run_scripts_sge/run_dFC.sh
diff --git a/task_dFC/run_scripts/run_fmriprep.sh b/task_dFC/run_scripts_sge/run_fmriprep.sh
similarity index 100%
rename from task_dFC/run_scripts/run_fmriprep.sh
rename to task_dFC/run_scripts_sge/run_fmriprep.sh
diff --git a/task_dFC/run_scripts/run_nifti_to_roi.sh b/task_dFC/run_scripts_sge/run_nifti_to_roi.sh
similarity index 100%
rename from task_dFC/run_scripts/run_nifti_to_roi.sh
rename to task_dFC/run_scripts_sge/run_nifti_to_roi.sh
diff --git a/task_dFC/run_scripts/run_report.sh b/task_dFC/run_scripts_sge/run_report.sh
similarity index 100%
rename from task_dFC/run_scripts/run_report.sh
rename to task_dFC/run_scripts_sge/run_report.sh
diff --git a/task_dFC/run_scripts_slurm/dataset_info.json b/task_dFC/run_scripts_slurm/dataset_info.json
new file mode 100644
index 0000000..16d775e
--- /dev/null
+++ b/task_dFC/run_scripts_slurm/dataset_info.json
@@ -0,0 +1,22 @@
+{
+	"dataset" : "",
+	"main_root" : "/data/origami/dFC/DATA/task-based/openneuro/{dataset}",
+	"fmriprep_root" : "{main_root}/derivatives/fmriprep/23.1.3/output",
+	"roi_root" : "{main_root}/derivatives/ROI_timeseries",
+	"fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES",
+	"dFC_root" : "{main_root}/derivatives/dFC_assessed",
+	"ML_root" : "{main_root}/derivatives/ML",
+	"reports_root" : "{main_root}/derivatives/reports",
+	"trial_type_label" : "trial_type",
+	"rest_labels" : ["rest", "Rest"],
+	"bold_suffix" : "_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz",
+	"SESSIONS" : [
+		"ses-1"
+	],
+	"TASKS" : [
+		"task-A"
+	],
+	"RUNS" : {
+    		"task-A": ["run-01", "run-02", "run-03", "run-04", "run-05", "run-06"]
+	}
+}
diff --git a/task_dFC/run_scripts_slurm/global_configs.json b/task_dFC/run_scripts_slurm/global_configs.json
new file mode 100644
index 0000000..44a524c
--- /dev/null
+++ b/task_dFC/run_scripts_slurm/global_configs.json
@@ -0,0 +1,54 @@
+{
+    "DATASET_NAME": "",
+    "DATASET_ROOT": "/home/mt00/scratch/DATA/task-based/openneuro//",
+
+    "CONTAINER_STORE": "/home/mt00/projects/def-jbpoline/container_store/nipoppy/",
+
+    "SINGULARITY_PATH": "singularity",
+
+    "TEMPLATEFLOW_DIR": "/home/mt00/projects/def-jbpoline/templateflow",
+
+    "SESSIONS": [],
+    "VISITS": [],
+
+    "BIDS": {
+        "heudiconv": {
+            "VERSION": "0.11.6",
+            "CONTAINER": "heudiconv_{}.sif",
+            "URL": ""
+        },
+        "validator":{
+            "CONTAINER": "bids_validator.sif",
+            "URL": ""
+
+        }
+    },
+
+    "PROC_PIPELINES": {
+        "mriqc": {
+            "VERSION": "23.1.0",
+            "CONTAINER": "mriqc_{}.sif",
+            "URL": ""
+        },
+        "fmriprep": {
+            "VERSION": "23.1.3",
+            "CONTAINER": "fmriprep_{}.sif",
+            "URL": ""
+        },
+        "freesurfer": {
+            "VERSION": "7.3.2",
+            "CONTAINER": "fmriprep_{}.sif",
+            "URL": ""
+        }
+    },
+
+    "TABULAR": {
+        "data_dictionary": {
+            "PATH": "",
+            "VERSION": "",
+            "URL": ""
+        }
+    },
+
+    "WORKFLOWS": []
+}
diff --git a/task_dFC/run_scripts_slurm/methods_config.json b/task_dFC/run_scripts_slurm/methods_config.json
new file mode 100644
index 0000000..d4013d4
--- /dev/null
+++ b/task_dFC/run_scripts_slurm/methods_config.json
@@ -0,0 +1,35 @@
+{
+    "params_methods" : {
+        "W": 12,
+        "n_overlap": 1.0,
+        "sw_method": "pear_corr",
+        "tapered_window": true,
+        "TF_method": "WTC",
+        "clstr_base_measure": "SlidingWindow",
+        "hmm_iter": 20,
+        "dhmm_obs_state_ratio": 0.666,
+        "n_states": 5,
+        "n_subj_clstrs": 10,
+        "n_jobs": 2,
+        "verbose": 0,
+        "backend": "loky",
+        "normalization": true,
+        "num_subj": null,
+        "num_time_point": null
+    },
+    "MEASURES_name_lst" : [
+        "SlidingWindow",
+        "Time-Freq",
+        "CAP",
+        "ContinuousHMM",
+        "Windowless",
+        "Clustering",
+        "DiscreteHMM"
+    ],
+    "alter_hparams" : [],
+    "params_multi_analysis" : {
+        "n_jobs": null,
+        "verbose": 0,
+        "backend": "loky"
+    }
+}
diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh
new file mode 100644
index 0000000..a84c578
--- /dev/null
+++ b/task_dFC/run_scripts_slurm/run_FCS.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+#
+#$ -cwd
+#$ -o logs/fcs_out.txt
+#$ -e logs/fcs_err.txt
+#$ -l h_vmem=64G
+#$ -q origami.q
+
+DATASET_INFO="./dataset_info.json"
+METHODS_CONFIG="./methods_config.json"
+
+source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
+conda activate pydfc
+python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/FCS_estimate.py" \
+--dataset_info $DATASET_INFO \
+--methods_config $METHODS_CONFIG
+
+conda deactivate
diff --git a/task_dFC/run_scripts_slurm/run_ML.sh b/task_dFC/run_scripts_slurm/run_ML.sh
new file mode 100644
index 0000000..4ec431a
--- /dev/null
+++ b/task_dFC/run_scripts_slurm/run_ML.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+#
+#$ -cwd
+#$ -o logs/ML_out.txt
+#$ -e logs/ML_err.txt
+#$ -l h_vmem=64G
+#$ -q origami.q
+
+DATASET_INFO="./dataset_info.json"
+
+source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
+conda activate pydfc
+python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/ML.py" \
+--dataset_info $DATASET_INFO
+
+conda deactivate
diff --git a/task_dFC/run_scripts_slurm/run_dFC.sh b/task_dFC/run_scripts_slurm/run_dFC.sh
new file mode 100644
index 0000000..124dc1f
--- /dev/null
+++ b/task_dFC/run_scripts_slurm/run_dFC.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+#
+#$ -cwd
+#$ -o logs/dfc_out.txt
+#$ -e logs/dfc_err.txt
+#$ -l h_vmem=32G
+#$ -q origami.q
+
+SUBJECT_LIST="./subj_list.txt"
+DATASET_INFO="./dataset_info.json"
+
+echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`"
+
+SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST`
+echo "Subject ID: $SUBJECT_ID"
+
+source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
+conda activate pydfc
+python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/dFC_assessment.py" \
+--dataset_info $DATASET_INFO \
+--participant_id $SUBJECT_ID
+
+conda deactivate
diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
new file mode 100644
index 0000000..7197245
--- /dev/null
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+#SBATCH --job-name=fmriprep_job       # Name of the job
+#SBATCH --output=logs/fmriprep_out.log  # Standard output log
+#SBATCH --error=logs/fmriprep_err.log   # Standard error log
+#SBATCH --time=24:00:00                # Walltime (24 hours)
+#SBATCH --mem=32G                      # Memory (32 GB)
+#SBATCH --cpus-per-task=1              # Number of CPU cores per task
+#SBATCH --account=rrg-jbpoline           # Account
+
+source "/home/mt00/projects/rrg-jbpoline/mt00/venvs/nipoppy_env/bin/activate"
+
+SUBJECT_LIST="./subj_list.txt"
+
+echo "Number subjects found: $(wc -l < $SUBJECT_LIST)"
+
+SUBJECT_ID=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST)
+echo "Subject ID: $SUBJECT_ID"
+
+nipoppy run \
+-pipeline fmriprep \
+--participant_id $SUBJECT_ID
+
+deactivate
diff --git a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
new file mode 100644
index 0000000..1fff1da
--- /dev/null
+++ b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+#
+#$ -cwd
+#$ -o logs/roi_out.txt
+#$ -e logs/roi_err.txt
+#$ -l h_vmem=32G
+#$ -q origami.q
+
+SUBJECT_LIST="./subj_list.txt"
+DATASET_INFO="./dataset_info.json"
+
+echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`"
+
+SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST`
+echo "Subject ID: $SUBJECT_ID"
+
+source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
+conda activate pydfc
+python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/nifti_to_roi_signal.py" \
+--dataset_info $DATASET_INFO \
+--participant_id $SUBJECT_ID
+
+conda deactivate
diff --git a/task_dFC/run_scripts_slurm/run_report.sh b/task_dFC/run_scripts_slurm/run_report.sh
new file mode 100644
index 0000000..2a00cc5
--- /dev/null
+++ b/task_dFC/run_scripts_slurm/run_report.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+#
+#$ -cwd
+#$ -o logs/report_out.txt
+#$ -e logs/report_err.txt
+#$ -l h_vmem=16G
+#$ -q origami.q
+
+DATASET_INFO="./dataset_info.json"
+SUBJ_LIST="./subj_list.txt"
+
+source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
+conda activate pydfc
+python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/generate_report.py" \
+--dataset_info $DATASET_INFO \
+--subj_list $SUBJ_LIST
+
+conda deactivate

From 4d2aea2fa6abf0ceda2fad60d19f9ce42ed8fe9e Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 12 Sep 2024 13:01:10 -0400
Subject: [PATCH 112/401] new global config

---
 .../run_scripts_slurm/global_configs.json     | 197 ++++++++++++++----
 1 file changed, 151 insertions(+), 46 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/global_configs.json b/task_dFC/run_scripts_slurm/global_configs.json
index 44a524c..252968f 100644
--- a/task_dFC/run_scripts_slurm/global_configs.json
+++ b/task_dFC/run_scripts_slurm/global_configs.json
@@ -1,54 +1,159 @@
 {
-    "DATASET_NAME": "",
-    "DATASET_ROOT": "/home/mt00/scratch/DATA/task-based/openneuro//",
-
-    "CONTAINER_STORE": "/home/mt00/projects/def-jbpoline/container_store/nipoppy/",
-
-    "SINGULARITY_PATH": "singularity",
-
-    "TEMPLATEFLOW_DIR": "/home/mt00/projects/def-jbpoline/templateflow",
-
-    "SESSIONS": [],
-    "VISITS": [],
-
-    "BIDS": {
-        "heudiconv": {
-            "VERSION": "0.11.6",
-            "CONTAINER": "heudiconv_{}.sif",
-            "URL": ""
-        },
-        "validator":{
-            "CONTAINER": "bids_validator.sif",
-            "URL": ""
-
-        }
+    "DATASET_NAME": "<DATASET_NAME>",
+    "VISIT_IDS": [],
+    "SESSION_IDS": [],
+    "SUBSTITUTIONS": {
+        "[[NIPOPPY_DPATH_CONTAINERS]]": "/home/mt00/projects/def-jbpoline/container_store/nipoppy/",
+        "[[HEUDICONV_HEURISTIC_FILE]]": "",
+        "[[DCM2BIDS_CONFIG_FILE]]": "",
+        "[[FREESURFER_LICENSE_FILE]]": "/home/mt00/projects/rrg-jbpoline/mt00/freesurfer/",
+        "[[TEMPLATEFLOW_HOME]]": "/home/mt00/projects/def-jbpoline/templateflow"
     },
-
-    "PROC_PIPELINES": {
-        "mriqc": {
-            "VERSION": "23.1.0",
-            "CONTAINER": "mriqc_{}.sif",
-            "URL": ""
+    "DICOM_DIR_PARTICIPANT_FIRST": true,
+    "CONTAINER_CONFIG": {
+        "COMMAND": "apptainer",
+        "ARGS": [
+            "--cleanenv"
+        ]
+    },
+    "BIDS_PIPELINES": [
+        {
+            "NAME": "heudiconv",
+            "VERSION": "0.12.2",
+            "CONTAINER_INFO": {
+                "FILE": "[[NIPOPPY_DPATH_CONTAINERS]]/[[PIPELINE_NAME]]_[[PIPELINE_VERSION]].sif",
+                "URI": "docker://nipy/[[PIPELINE_NAME]]:[[PIPELINE_VERSION]]"
+            },
+            "STEPS": [
+                {
+                    "NAME": "prepare",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json"
+                },
+                {
+                    "NAME": "convert",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json",
+                    "CONTAINER_CONFIG": {
+                        "ARGS": [
+                            "--bind",
+                            "[[HEUDICONV_HEURISTIC_FILE]]"
+                        ]
+                    },
+                    "UPDATE_DOUGHNUT": true
+                }
+            ]
+        },
+        {
+            "NAME": "dcm2bids",
+            "VERSION": "3.1.0",
+            "CONTAINER_INFO": {
+                "FILE": "[[NIPOPPY_DPATH_CONTAINERS]]/[[PIPELINE_NAME]]_[[PIPELINE_VERSION]].sif",
+                "URI": "docker://unfmontreal/[[PIPELINE_NAME]]:[[PIPELINE_VERSION]]"
+            },
+            "STEPS": [
+                {
+                    "NAME": "prepare",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/dcm2bids_helper-[[PIPELINE_VERSION]].json"
+                },
+                {
+                    "NAME": "convert",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/dcm2bids-[[PIPELINE_VERSION]].json",
+                    "CONTAINER_CONFIG": {
+                        "ARGS": [
+                            "--bind",
+                            "[[DCM2BIDS_CONFIG_FILE]]"
+                        ]
+                    },
+                    "UPDATE_DOUGHNUT": true
+                }
+            ]
         },
-        "fmriprep": {
+        {
+            "NAME": "bidscoin",
+            "VERSION": "4.3.2",
+            "STEPS": [
+                {
+                    "NAME": "prepare",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/bidsmapper-[[PIPELINE_VERSION]].json",
+                    "ANALYSIS_LEVEL": "group"
+                },
+                {
+                    "NAME": "edit",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/bidseditor-[[PIPELINE_VERSION]].json",
+                    "ANALYSIS_LEVEL": "group"
+                },
+                {
+                    "NAME": "convert",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/bidscoiner-[[PIPELINE_VERSION]].json",
+                    "ANALYSIS_LEVEL": "participant",
+                    "UPDATE_DOUGHNUT": true
+                }
+            ]
+        }
+    ],
+    "PROC_PIPELINES": [
+        {
+            "NAME": "fmriprep",
             "VERSION": "23.1.3",
-            "CONTAINER": "fmriprep_{}.sif",
-            "URL": ""
+            "CONTAINER_INFO": {
+                "FILE": "[[NIPOPPY_DPATH_CONTAINERS]]/[[PIPELINE_NAME]]_[[PIPELINE_VERSION]].sif",
+                "URI": "docker://nipreps/[[PIPELINE_NAME]]:[[PIPELINE_VERSION]]"
+            },
+            "CONTAINER_CONFIG": {
+                "ENV_VARS": {
+                    "TEMPLATEFLOW_HOME": "[[TEMPLATEFLOW_HOME]]"
+                },
+                "ARGS": [
+                    "--bind",
+                    "[[FREESURFER_LICENSE_FILE]]",
+                    "--bind",
+                    "[[TEMPLATEFLOW_HOME]]"
+                ]
+            },
+            "STEPS": [
+                {
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json"
+                }
+            ],
+            "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_TRACKER_CONFIGS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json"
         },
-        "freesurfer": {
+        {
+            "NAME": "freesurfer",
             "VERSION": "7.3.2",
-            "CONTAINER": "fmriprep_{}.sif",
-            "URL": ""
-        }
-    },
-
-    "TABULAR": {
-        "data_dictionary": {
-            "PATH": "",
-            "VERSION": "",
-            "URL": ""
+            "DESCRIPTION": "Freesurfer version associated with fMRIPrep 23.1.3",
+            "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_TRACKER_CONFIGS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json"
+        },
+        {
+            "NAME": "mriqc",
+            "VERSION": "23.1.0",
+            "CONTAINER_INFO": {
+                "FILE": "[[NIPOPPY_DPATH_CONTAINERS]]/[[PIPELINE_NAME]]_[[PIPELINE_VERSION]].sif",
+                "URI": "docker://nipreps/[[PIPELINE_NAME]]:[[PIPELINE_VERSION]]"
+            },
+            "CONTAINER_CONFIG": {
+                "ENV_VARS": {
+                    "TEMPLATEFLOW_HOME": "[[TEMPLATEFLOW_HOME]]"
+                },
+                "ARGS": [
+                    "--bind",
+                    "[[TEMPLATEFLOW_HOME]]"
+                ]
+            },
+            "STEPS": [
+                {
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json"
+                }
+            ],
+            "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_TRACKER_CONFIGS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json"
         }
-    },
-
-    "WORKFLOWS": []
+    ],
+    "CUSTOM": {}
 }

From a8a36ced23817184859ec88e04f8d00de85dfc30 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 12 Sep 2024 13:07:22 -0400
Subject: [PATCH 113/401] rename slurm global_configs.json to global_config.json

---
 .../run_scripts_slurm/{global_configs.json => global_config.json} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename task_dFC/run_scripts_slurm/{global_configs.json => global_config.json} (100%)

diff --git a/task_dFC/run_scripts_slurm/global_configs.json b/task_dFC/run_scripts_slurm/global_config.json
similarity index 100%
rename from task_dFC/run_scripts_slurm/global_configs.json
rename to task_dFC/run_scripts_slurm/global_config.json

From aa6b724eab15221d63516ffe171982e5f536b90a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 12 Sep 2024 13:36:18 -0400
Subject: [PATCH 114/401] use --pipeline and add --dataset-root in slurm run_fmriprep.sh

---
 task_dFC/run_scripts_slurm/run_fmriprep.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index 7197245..3f2aa99 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -18,7 +18,8 @@ SUBJECT_ID=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST)
 echo "Subject ID: $SUBJECT_ID"
 
 nipoppy run \
--pipeline fmriprep \
+--pipeline fmriprep \
+--dataset-root "$(dirname "$(pwd)")" \
 --participant_id $SUBJECT_ID
 
 deactivate

From 43686e3b8b99f9a8c06626b1065432de6c29b30c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 12 Sep 2024 13:44:14 -0400
Subject: [PATCH 115/401] use --participant-id (not --participant_id) in slurm run_fmriprep.sh

---
 task_dFC/run_scripts_slurm/run_fmriprep.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index 3f2aa99..1e90631 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -20,6 +20,6 @@ echo "Subject ID: $SUBJECT_ID"
 nipoppy run \
 --pipeline fmriprep \
 --dataset-root "$(dirname "$(pwd)")" \
---participant_id $SUBJECT_ID
+--participant-id $SUBJECT_ID
 
 deactivate

From 5fe74669c4513070883c11ff9cf4ec8485069b93 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 19 Sep 2024 12:42:52 -0400
Subject: [PATCH 116/401] add simul to slurm

---
 .../run_simulator.sh                            |  0
 simul_dFC/run_scripts_slurm/run_simulator.sh    | 17 +++++++++++++++++
 2 files changed, 17 insertions(+)
 rename simul_dFC/{run_scripts => run_scripts_sge}/run_simulator.sh (100%)
 create mode 100644 simul_dFC/run_scripts_slurm/run_simulator.sh

diff --git a/simul_dFC/run_scripts/run_simulator.sh b/simul_dFC/run_scripts_sge/run_simulator.sh
similarity index 100%
rename from simul_dFC/run_scripts/run_simulator.sh
rename to simul_dFC/run_scripts_sge/run_simulator.sh
diff --git a/simul_dFC/run_scripts_slurm/run_simulator.sh b/simul_dFC/run_scripts_slurm/run_simulator.sh
new file mode 100644
index 0000000..b363a7f
--- /dev/null
+++ b/simul_dFC/run_scripts_slurm/run_simulator.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+#
+#SBATCH --job-name=simul_dfc_job   # Optional: Name of your job
+#SBATCH --output=logs/simul_out.txt  # Standard output log
+#SBATCH --error=logs/simul_err.txt   # Standard error log
+#SBATCH --account=rrg-jbpoline           # Account
+#SBATCH --mem=8G                     # Memory request per node
+#SBATCH --array=1-200                # Task array specification
+
+# Activate  virtual environment
+source "/home/mt00/projects/rrg-jbpoline/mt00/venvs/pydfc/bin/activate"
+
+# Run Python script
+python "/home/mt00/projects/rrg-jbpoline/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py"
+
+# Deactivate environment
+deactivate

From 3e0ae70237864a20c7a49ce94328b4e2dc936def Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 19 Sep 2024 17:41:27 -0400
Subject: [PATCH 117/401] add PCA to embedding

---
 pydfc/ml_utils.py           | 365 +++++++++++++++++++-----------------
 task_dFC/ML.py              |   2 +
 task_dFC/generate_report.py | 309 +++++++++++++++---------------
 3 files changed, 352 insertions(+), 324 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 693e881..e7cce64 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -819,6 +819,11 @@ def embed_dFC_features(
 
     LE_embedding_method: "concat+embed" or "embed+procrustes"
     """
+    # make a copy of the data
+    X_train = X_train.copy()
+    if X_test is not None:
+        X_test = X_test.copy()
+
     if embedding == "PCA":
         # if n_components is not specified, use 95% of the variance
         if n_components == "auto":
@@ -1077,54 +1082,7 @@ def task_presence_classification(
         )
     )
 
-    # embed dFC features
-    X_train, X_test = embed_dFC_features(
-        train_subjects=train_subjects,
-        test_subjects=test_subjects,
-        X_train=X_train,
-        X_test=X_test,
-        y_train=y_train,
-        y_test=y_test,
-        subj_label_train=subj_label_train,
-        subj_label_test=subj_label_test,
-        embedding="LE",
-        n_components="auto",
-        n_neighbors_LE=125,
-        LE_embedding_method="embed+procrustes",
-    )
-
-    # task presence classification
-
-    print("task presence classification ...")
-
-    # logistic regression
-    log_reg_RESULT = logistic_regression_classify(X_train, y_train, X_test, y_test)
-
-    # KNN
-    KNN_RESULT = KNN_classify(X_train, y_train, X_test, y_test)
-
-    # # Random Forest
-    # RF_RESULT = random_forest_classify(
-    #     X_train, y_train, X_test, y_test
-    # )
-
-    # # Gradient Boosting
-    # GBT_RESULT = gradient_boosting_classify(
-    #     X_train, y_train, X_test, y_test
-    # )
-
     ML_RESULT = {}
-    for key in log_reg_RESULT:
-        ML_RESULT[key] = log_reg_RESULT[key]
-    for key in KNN_RESULT:
-        ML_RESULT[key] = KNN_RESULT[key]
-    # for key in RF_RESULT:
-    #     ML_RESULT[key] = RF_RESULT[key]
-    # for key in GBT_RESULT:
-    #     ML_RESULT[key] = GBT_RESULT[key]
-
-    # measure pred score on each subj
-
     ML_scores = {
         "subj_id": list(),
         "group": list(),
@@ -1135,42 +1093,94 @@ def task_presence_classification(
         "KNN accuracy": list(),
         # "Random Forest accuracy": list(),
         # "Gradient Boosting accuracy": list(),
+        "embedding": list(),
     }
-    log_reg = log_reg_RESULT["log_reg_model"]
-    KNN = KNN_RESULT["KNN_model"]
-    # RF = RF_RESULT["RF_model"]
-    # GBT = GBT_RESULT["GB_model"]
-
-    for subj in SUBJECTS:
-        ML_scores["subj_id"].append(subj)
-        if subj in train_subjects:
-            ML_scores["group"].append("train")
-            features = X_train[subj_label_train == subj, :]
-            target = y_train[subj_label_train == subj]
-        elif subj in test_subjects:
-            ML_scores["group"].append("test")
-            features = X_test[subj_label_test == subj, :]
-            target = y_test[subj_label_test == subj]
-
-        pred_lr = log_reg.predict(features)
-        pred_KNN = KNN.predict(features)
-        # pred_RF = RF.predict(features)
-        # pred_GBT = GBT.predict(features)
-
-        ML_scores["Logistic regression accuracy"].append(
-            balanced_accuracy_score(target, pred_lr)
+    for embedding in ["PCA", "LE"]:
+        # embed dFC features
+        X_train_embedded, X_test_embedded = embed_dFC_features(
+            train_subjects=train_subjects,
+            test_subjects=test_subjects,
+            X_train=X_train,
+            X_test=X_test,
+            y_train=y_train,
+            y_test=y_test,
+            subj_label_train=subj_label_train,
+            subj_label_test=subj_label_test,
+            embedding=embedding,
+            n_components="auto",
+            n_neighbors_LE=125,
+            LE_embedding_method="embed+procrustes",
         )
-        ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN))
-        # ML_scores["Random Forest accuracy"].append(
-        #     balanced_accuracy_score(target, pred_RF)
+
+        # task presence classification
+
+        print("task presence classification ...")
+
+        # logistic regression
+        log_reg_RESULT = logistic_regression_classify(
+            X_train_embedded, y_train, X_test_embedded, y_test
+        )
+
+        # KNN
+        KNN_RESULT = KNN_classify(X_train_embedded, y_train, X_test_embedded, y_test)
+
+        # # Random Forest
+        # RF_RESULT = random_forest_classify(
+        #     X_train_embedded, y_train, X_test_embedded, y_test
         # )
-        # ML_scores["Gradient Boosting accuracy"].append(
-        #     balanced_accuracy_score(target, pred_GBT)
+
+        # # Gradient Boosting
+        # GBT_RESULT = gradient_boosting_classify(
+        #     X_train_embedded, y_train, X_test_embedded, y_test
         # )
 
-        ML_scores["task"].append(task)
-        ML_scores["run"].append(run)
-        ML_scores["dFC method"].append(measure_name)
+        ML_RESULT[embedding] = {}
+        for key in log_reg_RESULT:
+            ML_RESULT[embedding][key] = log_reg_RESULT[key]
+        for key in KNN_RESULT:
+            ML_RESULT[embedding][key] = KNN_RESULT[key]
+        # for key in RF_RESULT:
+        #     ML_RESULT[embedding][key] = RF_RESULT[key]
+        # for key in GBT_RESULT:
+        #     ML_RESULT[embedding][key] = GBT_RESULT[key]
+
+        # measure pred score on each subj
+        log_reg = log_reg_RESULT["log_reg_model"]
+        KNN = KNN_RESULT["KNN_model"]
+        # RF = RF_RESULT["RF_model"]
+        # GBT = GBT_RESULT["GB_model"]
+
+        for subj in SUBJECTS:
+            ML_scores["subj_id"].append(subj)
+            if subj in train_subjects:
+                ML_scores["group"].append("train")
+                features = X_train_embedded[subj_label_train == subj, :]
+                target = y_train[subj_label_train == subj]
+            elif subj in test_subjects:
+                ML_scores["group"].append("test")
+                features = X_test_embedded[subj_label_test == subj, :]
+                target = y_test[subj_label_test == subj]
+
+            pred_lr = log_reg.predict(features)
+            pred_KNN = KNN.predict(features)
+            # pred_RF = RF.predict(features)
+            # pred_GBT = GBT.predict(features)
+
+            ML_scores["Logistic regression accuracy"].append(
+                balanced_accuracy_score(target, pred_lr)
+            )
+            ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN))
+            # ML_scores["Random Forest accuracy"].append(
+            #     balanced_accuracy_score(target, pred_RF)
+            # )
+            # ML_scores["Gradient Boosting accuracy"].append(
+            #     balanced_accuracy_score(target, pred_GBT)
+            # )
+
+            ML_scores["task"].append(task)
+            ML_scores["run"].append(run)
+            ML_scores["dFC method"].append(measure_name)
+            ML_scores["embedding"].append(embedding)
 
     return ML_RESULT, ML_scores
 
@@ -1214,49 +1224,7 @@ def task_presence_clustering(
         normalize_dFC=normalize_dFC,
     )
 
-    # embed dFC features
-    X, _ = embed_dFC_features(
-        train_subjects=SUBJECTS,
-        test_subjects=[],
-        X_train=X,
-        X_test=None,
-        y_train=y,
-        y_test=None,
-        subj_label_train=subj_label,
-        subj_label_test=None,
-        embedding="LE",
-        n_components="auto",
-        n_neighbors_LE=125,
-        LE_embedding_method="embed+procrustes",
-    )
-
-    # clustering
-    # apply kmeans clustering to dFC features
-
-    n_clusters = 2  # corresponding to task and rest
-
-    scaler = StandardScaler()
-    X_normalized = scaler.fit_transform(X)
-    kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
-    labels_pred = kmeans.fit_predict(X_normalized)
-
-    # ARI score
-    print(f"ARI score: {adjusted_rand_score(y, labels_pred)}")
-
-    # # visualize clustering centroids
-    # centroids = kmeans.cluster_centers_
-    # centroids = pca.inverse_transform(centroids)
-    # centroids = scaler.inverse_transform(centroids)
-    # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
-    # centroids_mat = dFC_vec2mat(centroids, n_regions)
-
-    clustering_RESULTS = {
-        "StandardScaler": scaler,
-        "kmeans": kmeans,
-        "ARI": adjusted_rand_score(y, labels_pred),
-        # "centroids": centroids_mat,
-    }
-
+    clustering_RESULTS = {}
     clustering_scores = {
         "subj_id": list(),
         "task": list(),
@@ -1265,22 +1233,69 @@ def task_presence_clustering(
         "Kmeans ARI": list(),
         "SI": list(),
     }
-    for subj in SUBJECTS:
-        clustering_scores["subj_id"].append(subj)
-        features = X[subj_label == subj, :]
-        target = y[subj_label == subj]
+    for embedding in ["PCA", "LE"]:
+        # embed dFC features
+        X_embedded, _ = embed_dFC_features(
+            train_subjects=SUBJECTS,
+            test_subjects=[],
+            X_train=X,
+            X_test=None,
+            y_train=y,
+            y_test=None,
+            subj_label_train=subj_label,
+            subj_label_test=None,
+            embedding=embedding,
+            n_components="auto",
+            n_neighbors_LE=125,
+            LE_embedding_method="embed+procrustes",
+        )
+
+        # clustering
+        # apply kmeans clustering to dFC features
+
+        n_clusters = 2  # corresponding to task and rest
+
+        scaler = StandardScaler()
+        X_normalized = scaler.fit_transform(X_embedded)
+        kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
+        labels_pred = kmeans.fit_predict(X_normalized)
+
+        # ARI score
+        print(f"ARI score: {adjusted_rand_score(y, labels_pred)}")
+
+        # # visualize clustering centroids
+        # centroids = kmeans.cluster_centers_
+        # centroids = pca.inverse_transform(centroids)
+        # centroids = scaler.inverse_transform(centroids)
+        # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
+        # centroids_mat = dFC_vec2mat(centroids, n_regions)
 
-        features_normalized = scaler.transform(features)
-        pred_kmeans = kmeans.predict(features_normalized)
+        clustering_RESULTS[embedding] = {
+            "StandardScaler": scaler,
+            "kmeans": kmeans,
+            "ARI": adjusted_rand_score(y, labels_pred),
+            # "centroids": centroids_mat,
+        }
 
-        clustering_scores["Kmeans ARI"].append(adjusted_rand_score(target, pred_kmeans))
+        for subj in SUBJECTS:
+            clustering_scores["subj_id"].append(subj)
+            features = X_embedded[subj_label == subj, :]
+            target = y[subj_label == subj]
 
-        # silhouette score in terms of separability of original labels, not the clustering labels
-        clustering_scores["SI"].append(silhouette_score(features, target))
+            features_normalized = scaler.transform(features)
+            pred_kmeans = kmeans.predict(features_normalized)
 
-        clustering_scores["task"].append(task)
-        clustering_scores["run"].append(run)
-        clustering_scores["dFC method"].append(measure_name)
+            clustering_scores["Kmeans ARI"].append(
+                adjusted_rand_score(target, pred_kmeans)
+            )
+
+            # silhouette score in terms of separability of original labels, not the clustering labels
+            clustering_scores["SI"].append(silhouette_score(features, target))
+
+            clustering_scores["task"].append(task)
+            clustering_scores["run"].append(run)
+            clustering_scores["dFC method"].append(measure_name)
+            clustering_scores.setdefault("embedding", []).append(embedding)  # key not pre-declared
 
     return clustering_RESULTS, clustering_scores
 
@@ -1353,47 +1368,49 @@ def task_paradigm_clustering(
     y = y[idx]
     subj_label = subj_label[idx]
 
-    # embed dFC features
-    X_embed, _ = embed_dFC_features(
-        train_subjects=SUBJECTS,
-        test_subjects=[],
-        X_train=X,
-        X_test=None,
-        y_train=y,
-        y_test=None,
-        subj_label_train=subj_label,
-        subj_label_test=None,
-        embedding="LE",
-        n_components="auto",
-        n_neighbors_LE=125,
-        LE_embedding_method="embed+procrustes",
-    )
+    task_paradigm_clstr_RESULTS = {}
+    for embedding in ["PCA", "LE"]:
+        # embed dFC features
+        X_embed, _ = embed_dFC_features(
+            train_subjects=SUBJECTS,
+            test_subjects=[],
+            X_train=X,
+            X_test=None,
+            y_train=y,
+            y_test=None,
+            subj_label_train=subj_label,
+            subj_label_test=None,
+            embedding=embedding,
+            n_components="auto",
+            n_neighbors_LE=125,
+            LE_embedding_method="embed+procrustes",
+        )
 
-    # clustering
-    # apply kmeans clustering to dFC features
-
-    n_clusters = len(TASKS)  # corresponding to task paradigms
-
-    scaler = StandardScaler()
-    X_normalized = scaler.fit_transform(X_embed)
-    kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
-    labels_pred = kmeans.fit_predict(X_normalized)
-
-    # # visualize clustering centroids
-    # centroids = kmeans.cluster_centers_
-    # centroids = pca.inverse_transform(centroids)
-    # centroids = scaler.inverse_transform(centroids)
-    # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
-    # centroids_mat = dFC_vec2mat(centroids, n_regions)
-
-    task_paradigm_clstr_RESULTS = {
-        "dFC_method": measure_name,
-        "StandardScaler": scaler,
-        "kmeans": kmeans,
-        "ARI": adjusted_rand_score(y, labels_pred),
-        "SI": silhouette_score(X_normalized, y),
-        # "centroids": centroids_mat,
-        "task_paradigms": TASKS,
-    }
+        # clustering
+        # apply kmeans clustering to dFC features
+
+        n_clusters = len(TASKS)  # corresponding to task paradigms
+
+        scaler = StandardScaler()
+        X_normalized = scaler.fit_transform(X_embed)
+        kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
+        labels_pred = kmeans.fit_predict(X_normalized)
+
+        # # visualize clustering centroids
+        # centroids = kmeans.cluster_centers_
+        # centroids = pca.inverse_transform(centroids)
+        # centroids = scaler.inverse_transform(centroids)
+        # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
+        # centroids_mat = dFC_vec2mat(centroids, n_regions)
+
+        task_paradigm_clstr_RESULTS[embedding] = {
+            "dFC_method": measure_name,
+            "StandardScaler": scaler,
+            "kmeans": kmeans,
+            "ARI": adjusted_rand_score(y, labels_pred),
+            "SI": silhouette_score(X_normalized, y),
+            # "centroids": centroids_mat,
+            "task_paradigms": TASKS,
+        }
 
     return task_paradigm_clstr_RESULTS
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index a792130..9a473dd 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -72,6 +72,7 @@ def run_classification(
             "KNN accuracy": list(),
             # "Random Forest accuracy": list(),
             # "Gradient Boosting accuracy": list(),
+            "embedding": list(),
         }
 
         ML_RESULT = {}
@@ -129,6 +130,7 @@ def run_clustering(
             "dFC method": list(),
             "Kmeans ARI": list(),
             "SI": list(),
+            "embedding": list(),
         }
 
         clustering_RESULTS = {}
diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 4c99d88..301cb5e 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -452,7 +452,13 @@ def plot_dFC_matrices(
 
 
 def plot_ML_results(
-    ML_root, output_root, task, run=None, session=None, ML_algorithm="Random Forest"
+    ML_root,
+    output_root,
+    task,
+    run=None,
+    session=None,
+    ML_algorithm="Random Forest",
+    embedding="PCA",
 ):
     """
     Plot the ML results for a given task, run and session.
@@ -464,6 +470,7 @@ def plot_ML_results(
         run: int, run number
         session: str, session name
         ML_algorithm: str, ML algorithm name (default: Random Forest, other options: Logistic regression, KNN, Gradient Boosting)
+        embedding: str, embedding method (default: PCA, other options: LE)
     """
     # the ML_scores files are saved as ML_scores_classify_{dFC_id}.npy
     # find all the ML_scores files in the directory
@@ -493,10 +500,13 @@ def plot_ML_results(
     if run is not None:
         dataframe = dataframe[dataframe["run"] == run]
 
+    dataframe = dataframe[dataframe["task"] == task]
+    dataframe = dataframe[dataframe["embedding"] == embedding]
+
     plt.figure(figsize=(10, 5))
 
     g = sns.pointplot(
-        data=dataframe[dataframe["task"] == task],
+        data=dataframe,
         x="dFC method",
         y=f"{ML_algorithm} accuracy",
         hue="group",
@@ -532,7 +542,7 @@ def plot_ML_results(
 
     if run is None:
         plt.savefig(
-            f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}.{save_fig_format}",
+            f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}_{embedding}.{save_fig_format}",
             dpi=fig_dpi,
             bbox_inches=fig_bbox_inches,
             pad_inches=fig_pad,
@@ -540,7 +550,7 @@ def plot_ML_results(
         )
     else:
         plt.savefig(
-            f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}_{run}.{save_fig_format}",
+            f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}_{run}_{embedding}.{save_fig_format}",
             dpi=fig_dpi,
             bbox_inches=fig_bbox_inches,
             pad_inches=fig_pad,
@@ -550,7 +560,9 @@ def plot_ML_results(
     plt.close()
 
 
-def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
+def plot_clustering_results(
+    ML_root, output_root, task, run=None, session=None, embedding="PCA"
+):
     """
     Plot the clustering results for a given task, run and session.
     parameters:
@@ -560,6 +572,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
         task: str, task name
         run: int, run number
         session: str, session name
+        embedding: str, embedding method (default: PCA, other options: LE)
     """
     # the clustering_scores files are saved as clustering_scores_{dFC_id}.npy
     # find all the clustering_scores files in the directory
@@ -593,10 +606,13 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
     if run is not None:
         dataframe = dataframe[dataframe["run"] == run]
 
+    dataframe = dataframe[dataframe["task"] == task]
+    dataframe = dataframe[dataframe["embedding"] == embedding]
+
     # plot ARI score
     plt.figure(figsize=(10, 5))
     g = sns.pointplot(
-        data=dataframe[dataframe["task"] == task],
+        data=dataframe,
         x="dFC method",
         y="Kmeans ARI",
         errorbar="sd",
@@ -621,7 +637,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
 
     if run is None:
         plt.savefig(
-            f"{output_dir}/clustering_results_ARI_{task}.{save_fig_format}",
+            f"{output_dir}/clustering_results_ARI_{task}_{embedding}.{save_fig_format}",
             dpi=fig_dpi,
             bbox_inches=fig_bbox_inches,
             pad_inches=fig_pad,
@@ -629,7 +645,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
         )
     else:
         plt.savefig(
-            f"{output_dir}/clustering_results_ARI_{task}_{run}.{save_fig_format}",
+            f"{output_dir}/clustering_results_ARI_{task}_{run}_{embedding}.{save_fig_format}",
             dpi=fig_dpi,
             bbox_inches=fig_bbox_inches,
             pad_inches=fig_pad,
@@ -641,7 +657,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
     # plot SI score
     plt.figure(figsize=(10, 5))
     g = sns.pointplot(
-        data=dataframe[dataframe["task"] == task],
+        data=dataframe,
         x="dFC method",
         y="SI",
         errorbar="sd",
@@ -664,7 +680,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
 
     if run is None:
         plt.savefig(
-            f"{output_dir}/clustering_results_SI_{task}.{save_fig_format}",
+            f"{output_dir}/clustering_results_SI_{task}_{embedding}.{save_fig_format}",
             dpi=fig_dpi,
             bbox_inches=fig_bbox_inches,
             pad_inches=fig_pad,
@@ -672,7 +688,7 @@ def plot_clustering_results(ML_root, output_root, task, run=None, session=None):
         )
     else:
         plt.savefig(
-            f"{output_dir}/clustering_results_SI_{task}_{run}.{save_fig_format}",
+            f"{output_dir}/clustering_results_SI_{task}_{run}_{embedding}.{save_fig_format}",
             dpi=fig_dpi,
             bbox_inches=fig_bbox_inches,
             pad_inches=fig_pad,
@@ -686,6 +702,7 @@ def plot_paradigm_clustering_score(
     ML_root,
     output_root,
     session=None,
+    embedding="PCA",
 ):
     """
     Plot the clustering results for a given task, run and session.
@@ -696,6 +713,7 @@ def plot_paradigm_clustering_score(
         task: str, task name
         run: int, run number
         session: str, session name
+        embedding: str, embedding method (default: PCA, other options: LE)
     """
     # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy
     # find all the paradigm_clustering_RESULTS files in the directory
@@ -720,13 +738,13 @@ def plot_paradigm_clustering_score(
             f"{input_dir}/{result_file}", allow_pickle="TRUE"
         ).item()
         paradigm_clustering_RESULTS["dFC method"].append(
-            paradigm_clustering_RESULTS_new["dFC_method"]
+            paradigm_clustering_RESULTS_new[embedding]["dFC_method"]
         )
         paradigm_clustering_RESULTS["ARI score"].append(
-            paradigm_clustering_RESULTS_new["ARI"]
+            paradigm_clustering_RESULTS_new[embedding]["ARI"]
         )
         paradigm_clustering_RESULTS["SI score"].append(
-            paradigm_clustering_RESULTS_new["SI"]
+            paradigm_clustering_RESULTS_new[embedding]["SI"]
         )
 
     sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0})
@@ -764,7 +782,7 @@ def plot_paradigm_clustering_score(
         os.makedirs(output_dir)
 
     plt.savefig(
-        f"{output_dir}/paradigm_clustering_results_ARI.{save_fig_format}",
+        f"{output_dir}/paradigm_clustering_results_ARI_{embedding}.{save_fig_format}",
         dpi=fig_dpi,
         bbox_inches=fig_bbox_inches,
         pad_inches=fig_pad,
@@ -801,7 +819,7 @@ def plot_paradigm_clustering_score(
         os.makedirs(output_dir)
 
     plt.savefig(
-        f"{output_dir}/paradigm_clustering_results_SI.{save_fig_format}",
+        f"{output_dir}/paradigm_clustering_results_SI_{embedding}.{save_fig_format}",
         dpi=fig_dpi,
         bbox_inches=fig_bbox_inches,
         pad_inches=fig_pad,
@@ -1225,45 +1243,41 @@ def create_html_report_group_results(
                 else:
                     classification_dir = f"{group_dir}/classification"
 
-                # display Random Forest classification results
-                file.write("<h3>KNN</h3>\n")
-                if run is None:
-                    classification_img = (
-                        f"{classification_dir}/ML_results_classify_KNN_{task}.png"
+                for embedding in ["PCA", "LE"]:
+                    file.write(f"<h3>{embedding}</h3>\n")
+                    # display KNN classification results
+                    file.write("<h3>KNN</h3>\n")
+                    if run is None:
+                        classification_img = f"{classification_dir}/ML_results_classify_KNN_{task}_{embedding}.png"
+                    else:
+                        classification_img = f"{classification_dir}/ML_results_classify_KNN_{task}_{run}_{embedding}.png"
+                    img = plt.imread(classification_img)
+                    height, width, _ = img.shape
+                    # change the width so that height equals img_height
+                    width = int(width * img_height / height)
+                    # replace the path to the image with a relative path
+                    classification_img = classification_img.replace(group_dir, ".")
+                    file.write(
+                        f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
                     )
-                else:
-                    classification_img = (
-                        f"{classification_dir}/ML_results_classify_KNN_{task}_{run}.png"
-                    )
-                img = plt.imread(classification_img)
-                height, width, _ = img.shape
-                # change the width so that height equals img_height
-                width = int(width * img_height / height)
-                # replace the path to the image with a relative path
-                classification_img = classification_img.replace(group_dir, ".")
-                file.write(
-                    f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
-                )
 
-                # display Logistic regression classification results
-                file.write("<h3>Logistic Regression</h3>\n")
-                if run is None:
-                    classification_img = (
-                        f"{classification_dir}/ML_results_classify_LogReg_{task}.png"
+                    # display Logistic regression classification results
+                    file.write("<h3>Logistic Regression</h3>\n")
+                    if run is None:
+                        classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{embedding}.png"
+                    else:
+                        classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{run}_{embedding}.png"
+                    img = plt.imread(classification_img)
+                    height, width, _ = img.shape
+                    # change the width so that height equals img_height
+                    width = int(width * img_height / height)
+                    # replace the path to the image with a relative path
+                    classification_img = classification_img.replace(group_dir, ".")
+                    file.write(
+                        f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
                     )
-                else:
-                    classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{run}.png"
-                img = plt.imread(classification_img)
-                height, width, _ = img.shape
-                # change the width so that height equals img_height
-                width = int(width * img_height / height)
-                # replace the path to the image with a relative path
-                classification_img = classification_img.replace(group_dir, ".")
-                file.write(
-                    f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
-                )
 
-                file.write("<br>\n")
+                    file.write("<br>\n")
 
     # clustering results
     img_height = 300
@@ -1281,43 +1295,41 @@ def create_html_report_group_results(
                 else:
                     clustering_dir = f"{group_dir}/clustering"
 
-                # display clustering ARI results
-                if run is None:
-                    clustering_img = f"{clustering_dir}/clustering_results_ARI_{task}.png"
-                else:
-                    clustering_img = (
-                        f"{clustering_dir}/clustering_results_ARI_{task}_{run}.png"
+                for embedding in ["PCA", "LE"]:
+                    file.write(f"<h3>{embedding}</h3>\n")
+                    # display clustering ARI results
+                    if run is None:
+                        clustering_img = f"{clustering_dir}/clustering_results_ARI_{task}_{embedding}.png"
+                    else:
+                        clustering_img = f"{clustering_dir}/clustering_results_ARI_{task}_{run}_{embedding}.png"
+                    img = plt.imread(clustering_img)
+                    height, width, _ = img.shape
+                    # change the width so that height equals img_height
+                    width = int(width * img_height / height)
+                    # replace the path to the image with a relative path
+                    clustering_img = clustering_img.replace(group_dir, ".")
+                    file.write(
+                        f"<img src='{clustering_img}' alt='Clustering results' width='{width}' height='{img_height}'>\n"
                     )
-                img = plt.imread(clustering_img)
-                height, width, _ = img.shape
-                # change the width so that height equals img_height
-                width = int(width * img_height / height)
-                # replace the path to the image with a relative path
-                clustering_img = clustering_img.replace(group_dir, ".")
-                file.write(
-                    f"<img src='{clustering_img}' alt='Clustering results' width='{width}' height='{img_height}'>\n"
-                )
 
-                file.write("<br>\n")
-
-                # display clustering SI results
-                if run is None:
-                    clustering_img = f"{clustering_dir}/clustering_results_SI_{task}.png"
-                else:
-                    clustering_img = (
-                        f"{clustering_dir}/clustering_results_SI_{task}_{run}.png"
+                    file.write("<br>\n")
+
+                    # display clustering SI results
+                    if run is None:
+                        clustering_img = f"{clustering_dir}/clustering_results_SI_{task}_{embedding}.png"
+                    else:
+                        clustering_img = f"{clustering_dir}/clustering_results_SI_{task}_{run}_{embedding}.png"
+                    img = plt.imread(clustering_img)
+                    height, width, _ = img.shape
+                    # change the width so that height equals img_height
+                    width = int(width * img_height / height)
+                    # replace the path to the image with a relative path
+                    clustering_img = clustering_img.replace(group_dir, ".")
+                    file.write(
+                        f"<img src='{clustering_img}' alt='Clustering results' width='{width}' height='{img_height}'>\n"
                     )
-                img = plt.imread(clustering_img)
-                height, width, _ = img.shape
-                # change the width so that height equals img_height
-                width = int(width * img_height / height)
-                # replace the path to the image with a relative path
-                clustering_img = clustering_img.replace(group_dir, ".")
-                file.write(
-                    f"<img src='{clustering_img}' alt='Clustering results' width='{width}' height='{img_height}'>\n"
-                )
 
-                file.write("<br>\n")
+                    file.write("<br>\n")
 
     # paradigm clustering results
     file.write("<h1>Paradigm Clustering Results</h1>\n")
@@ -1332,38 +1344,38 @@ def create_html_report_group_results(
         # display paradigm clustering ARI scores
         img_height = 300
         file.write("<h2>Paradigm Clustering ARI Scores</h2>\n")
-        paradigm_clustering_img = (
-            f"{paradigm_clustering_dir}/paradigm_clustering_results_ARI.png"
-        )
-        img = plt.imread(paradigm_clustering_img)
-        height, width, _ = img.shape
-        # change the width so that height equals img_height
-        width = int(width * img_height / height)
-        # replace the path to the image with a relative path
-        paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".")
-        file.write(
-            f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
-        )
+        for embedding in ["PCA", "LE"]:
+            file.write(f"<h3>{embedding}</h3>\n")
+            paradigm_clustering_img = f"{paradigm_clustering_dir}/paradigm_clustering_results_ARI_{embedding}.png"
+            img = plt.imread(paradigm_clustering_img)
+            height, width, _ = img.shape
+            # change the width so that height equals img_height
+            width = int(width * img_height / height)
+            # replace the path to the image with a relative path
+            paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".")
+            file.write(
+                f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
+            )
 
-        file.write("<br>\n")
+            file.write("<br>\n")
 
         # display paradigm clustering SI scores
         img_height = 300
         file.write("<h2>Paradigm Clustering SI Scores</h2>\n")
-        paradigm_clustering_img = (
-            f"{paradigm_clustering_dir}/paradigm_clustering_results_SI.png"
-        )
-        img = plt.imread(paradigm_clustering_img)
-        height, width, _ = img.shape
-        # change the width so that height equals img_height
-        width = int(width * img_height / height)
-        # replace the path to the image with a relative path
-        paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".")
-        file.write(
-            f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
-        )
+        for embedding in ["PCA", "LE"]:
+            file.write(f"<h3>{embedding}</h3>\n")
+            paradigm_clustering_img = f"{paradigm_clustering_dir}/paradigm_clustering_results_SI_{embedding}.png"
+            img = plt.imread(paradigm_clustering_img)
+            height, width, _ = img.shape
+            # change the width so that height equals img_height
+            width = int(width * img_height / height)
+            # replace the path to the image with a relative path
+            paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".")
+            file.write(
+                f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
+            )
 
-        file.write("<br>\n")
+            file.write("<br>\n")
 
         # # display paradigm clustering centroids
         # img_height = 300
@@ -1590,14 +1602,16 @@ def create_html_report_group_results(
         except Exception as e:
             print(f"Error in plotting task presence features: {e}")
 
-        try:
-            plot_paradigm_clustering_score(
-                ML_root=ML_root,
-                output_root=reports_root,
-                session=session,
-            )
-        except Exception as e:
-            print(f"Error in plotting paradigm clustering scores: {e}")
+        for embedding in ["PCA", "LE"]:
+            try:
+                plot_paradigm_clustering_score(
+                    ML_root=ML_root,
+                    output_root=reports_root,
+                    session=session,
+                    embedding=embedding,
+                )
+            except Exception as e:
+                print(f"Error in plotting paradigm clustering scores: {e}")
 
         # try:
         #     plot_paradigm_clstr_centroids(
@@ -1610,38 +1624,33 @@ def create_html_report_group_results(
 
         for task in TASKS:
             for run in RUNS[task]:
-                try:
-                    plot_ML_results(
-                        ML_root=ML_root,
-                        output_root=reports_root,
-                        task=task,
-                        run=run,
-                        session=session,
-                        ML_algorithm="KNN",
-                    )
-                except Exception as e:
-                    print(f"Error in plotting ML results for KNN: {e}")
-                try:
-                    plot_ML_results(
-                        ML_root=ML_root,
-                        output_root=reports_root,
-                        task=task,
-                        run=run,
-                        session=session,
-                        ML_algorithm="Logistic regression",
-                    )
-                except Exception as e:
-                    print(f"Error in plotting ML results for Logistic regression: {e}")
-                try:
-                    plot_clustering_results(
-                        ML_root=ML_root,
-                        output_root=reports_root,
-                        task=task,
-                        run=run,
-                        session=session,
-                    )
-                except Exception as e:
-                    print(f"Error in plotting clustering results: {e}")
+                for embedding in ["PCA", "LE"]:
+                    for ML_algorithm in ["KNN", "Logistic regression"]:
+                        try:
+                            plot_ML_results(
+                                ML_root=ML_root,
+                                output_root=reports_root,
+                                task=task,
+                                run=run,
+                                session=session,
+                                ML_algorithm=ML_algorithm,
+                                embedding=embedding,
+                            )
+                        except Exception as e:
+                            print(
+                                f"Error in plotting ML results for {ML_algorithm} and {embedding}: {e}"
+                            )
+                    try:
+                        plot_clustering_results(
+                            ML_root=ML_root,
+                            output_root=reports_root,
+                            task=task,
+                            run=run,
+                            session=session,
+                            embedding=embedding,
+                        )
+                    except Exception as e:
+                        print(f"Error in plotting clustering results: {e}")
 
     # create html report
     try:

From dbe1def78aa6bedd5a9defa642247f8285d0ffbe Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 20 Sep 2024 09:53:05 -0400
Subject: [PATCH 118/401] add embedding key to clustering scores in task_presence_clustering

---
 pydfc/ml_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index e7cce64..b3d9f9f 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1232,6 +1232,7 @@ def task_presence_clustering(
         "dFC method": list(),
         "Kmeans ARI": list(),
         "SI": list(),
+        "embedding": list(),
     }
     for embedding in ["PCA", "LE"]:
         # embed dFC features

From 1dfc2f69875e6aea718e946b4c59786ee4d13b44 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 20 Sep 2024 12:25:22 -0400
Subject: [PATCH 119/401] switch SLURM account to def-jbpoline and update cluster paths

---
 simul_dFC/run_scripts_slurm/run_simulator.sh  | 6 +++---
 task_dFC/run_scripts_slurm/dataset_info.json  | 2 +-
 task_dFC/run_scripts_slurm/global_config.json | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/simul_dFC/run_scripts_slurm/run_simulator.sh b/simul_dFC/run_scripts_slurm/run_simulator.sh
index b363a7f..eaf0194 100644
--- a/simul_dFC/run_scripts_slurm/run_simulator.sh
+++ b/simul_dFC/run_scripts_slurm/run_simulator.sh
@@ -3,15 +3,15 @@
 #SBATCH --job-name=simul_dfc_job   # Optional: Name of your job
 #SBATCH --output=logs/simul_out.txt  # Standard output log
 #SBATCH --error=logs/simul_err.txt   # Standard error log
-#SBATCH --account=rrg-jbpoline           # Account
+#SBATCH --account=def-jbpoline           # Account
 #SBATCH --mem=8G                     # Memory request per node
 #SBATCH --array=1-200                # Task array specification
 
 # Activate  virtual environment
-source "/home/mt00/projects/rrg-jbpoline/mt00/venvs/pydfc/bin/activate"
+source "/home/mt00/pydfc/bin/activate"
 
 # Run Python script
-python "/home/mt00/projects/rrg-jbpoline/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py"
+python "/home/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py"
 
 # Deactivate environment
 deactivate
diff --git a/task_dFC/run_scripts_slurm/dataset_info.json b/task_dFC/run_scripts_slurm/dataset_info.json
index 16d775e..e466511 100644
--- a/task_dFC/run_scripts_slurm/dataset_info.json
+++ b/task_dFC/run_scripts_slurm/dataset_info.json
@@ -1,6 +1,6 @@
 {
 	"dataset" : "",
-	"main_root" : "/data/origami/dFC/DATA/task-based/openneuro/{dataset}",
+	"main_root" : "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/{dataset}",
 	"fmriprep_root" : "{main_root}/derivatives/fmriprep/23.1.3/output",
 	"roi_root" : "{main_root}/derivatives/ROI_timeseries",
 	"fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES",
diff --git a/task_dFC/run_scripts_slurm/global_config.json b/task_dFC/run_scripts_slurm/global_config.json
index 252968f..0e0681c 100644
--- a/task_dFC/run_scripts_slurm/global_config.json
+++ b/task_dFC/run_scripts_slurm/global_config.json
@@ -6,7 +6,7 @@
         "[[NIPOPPY_DPATH_CONTAINERS]]": "/home/mt00/projects/def-jbpoline/container_store/nipoppy/",
         "[[HEUDICONV_HEURISTIC_FILE]]": "",
         "[[DCM2BIDS_CONFIG_FILE]]": "",
-        "[[FREESURFER_LICENSE_FILE]]": "/home/mt00/projects/rrg-jbpoline/mt00/freesurfer/",
+        "[[FREESURFER_LICENSE_FILE]]": "/home/mt00/projects/def-jbpoline/mt00/freesurfer/",
         "[[TEMPLATEFLOW_HOME]]": "/home/mt00/projects/def-jbpoline/templateflow"
     },
     "DICOM_DIR_PARTICIPANT_FIRST": true,

From 8262e842b6ece752c477dd7bbed9feb29557c8dd Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 20 Sep 2024 12:30:55 -0400
Subject: [PATCH 120/401] fix venv activate path in slurm run_simulator.sh

---
 simul_dFC/run_scripts_slurm/run_simulator.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simul_dFC/run_scripts_slurm/run_simulator.sh b/simul_dFC/run_scripts_slurm/run_simulator.sh
index eaf0194..d17f3bd 100644
--- a/simul_dFC/run_scripts_slurm/run_simulator.sh
+++ b/simul_dFC/run_scripts_slurm/run_simulator.sh
@@ -8,7 +8,7 @@
 #SBATCH --array=1-200                # Task array specification
 
 # Activate  virtual environment
-source "/home/mt00/pydfc/bin/activate"
+source "/home/mt00/venvs/pydfc/bin/activate"
 
 # Run Python script
 python "/home/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py"

From 9bf88087fc3cb0a1034cf0495b4f64c3dd664e4b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 20 Sep 2024 12:52:11 -0400
Subject: [PATCH 121/401] slurm changes

---
 simul_dFC/run_scripts_slurm/run_simulator.sh  |  1 +
 task_dFC/run_scripts_slurm/run_FCS.sh         | 20 ++++++++++---------
 task_dFC/run_scripts_slurm/run_ML.sh          | 20 ++++++++++---------
 task_dFC/run_scripts_slurm/run_dFC.sh         | 20 ++++++++++---------
 task_dFC/run_scripts_slurm/run_fmriprep.sh    |  5 ++---
 .../run_scripts_slurm/run_nifti_to_roi.sh     | 20 ++++++++++---------
 task_dFC/run_scripts_slurm/run_report.sh      | 20 ++++++++++---------
 7 files changed, 58 insertions(+), 48 deletions(-)

diff --git a/simul_dFC/run_scripts_slurm/run_simulator.sh b/simul_dFC/run_scripts_slurm/run_simulator.sh
index d17f3bd..f7f8998 100644
--- a/simul_dFC/run_scripts_slurm/run_simulator.sh
+++ b/simul_dFC/run_scripts_slurm/run_simulator.sh
@@ -4,6 +4,7 @@
 #SBATCH --output=logs/simul_out.txt  # Standard output log
 #SBATCH --error=logs/simul_err.txt   # Standard error log
 #SBATCH --account=def-jbpoline           # Account
+#SBATCH --time=24:00:00                # Walltime for each task (24 hours)
 #SBATCH --mem=8G                     # Memory request per node
 #SBATCH --array=1-200                # Task array specification
 
diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh
index a84c578..b4d3b52 100644
--- a/task_dFC/run_scripts_slurm/run_FCS.sh
+++ b/task_dFC/run_scripts_slurm/run_FCS.sh
@@ -1,18 +1,20 @@
 #!/bin/sh
 #
-#$ -cwd
-#$ -o logs/fcs_out.txt
-#$ -e logs/fcs_err.txt
-#$ -l h_vmem=64G
-#$ -q origami.q
+#SBATCH --job-name=fit_fcs_job   # Optional: Name of your job
+#SBATCH --output=logs/fcs_out.txt  # Standard output log
+#SBATCH --error=logs/fcs_err.txt   # Standard error log
+#SBATCH --account=def-jbpoline           # Account
+#SBATCH --time=96:00:00                # Walltime for each task (96 hours)
+#SBATCH --mem=64G                     # Memory request per node
 
 DATASET_INFO="./dataset_info.json"
 METHODS_CONFIG="./methods_config.json"
 
-source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
-conda activate pydfc
-python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/FCS_estimate.py" \
+# Activate  virtual environment
+source "/home/mt00/venvs/pydfc/bin/activate"
+
+python "/home/mt00/pydfc/dFC/task_dFC/FCS_estimate.py" \
 --dataset_info $DATASET_INFO \
 --methods_config $METHODS_CONFIG
 
-conda deactivate
+deactivate
diff --git a/task_dFC/run_scripts_slurm/run_ML.sh b/task_dFC/run_scripts_slurm/run_ML.sh
index 4ec431a..ff4a52b 100644
--- a/task_dFC/run_scripts_slurm/run_ML.sh
+++ b/task_dFC/run_scripts_slurm/run_ML.sh
@@ -1,16 +1,18 @@
 #!/bin/sh
 #
-#$ -cwd
-#$ -o logs/ML_out.txt
-#$ -e logs/ML_err.txt
-#$ -l h_vmem=64G
-#$ -q origami.q
+#SBATCH --job-name=ML_job   # Optional: Name of your job
+#SBATCH --output=logs/ML_out.txt  # Standard output log
+#SBATCH --error=logs/ML_err.txt   # Standard error log
+#SBATCH --account=def-jbpoline           # Account
+#SBATCH --time=72:00:00                # Walltime for each task (72 hours)
+#SBATCH --mem=64G                     # Memory request per node
 
 DATASET_INFO="./dataset_info.json"
 
-source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
-conda activate pydfc
-python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/ML.py" \
+# Activate  virtual environment
+source "/home/mt00/venvs/pydfc/bin/activate"
+
+python "/home/mt00/pydfc/dFC/task_dFC/ML.py" \
 --dataset_info $DATASET_INFO
 
-conda deactivate
+deactivate
diff --git a/task_dFC/run_scripts_slurm/run_dFC.sh b/task_dFC/run_scripts_slurm/run_dFC.sh
index 124dc1f..84edbb9 100644
--- a/task_dFC/run_scripts_slurm/run_dFC.sh
+++ b/task_dFC/run_scripts_slurm/run_dFC.sh
@@ -1,10 +1,11 @@
 #!/bin/sh
 #
-#$ -cwd
-#$ -o logs/dfc_out.txt
-#$ -e logs/dfc_err.txt
-#$ -l h_vmem=32G
-#$ -q origami.q
+#SBATCH --job-name=assess_dfc_job   # Optional: Name of your job
+#SBATCH --output=logs/dfc_out.txt  # Standard output log
+#SBATCH --error=logs/dfc_err.txt   # Standard error log
+#SBATCH --account=def-jbpoline           # Account
+#SBATCH --time=24:00:00                # Walltime for each task (24 hours)
+#SBATCH --mem=32G                     # Memory request per node
 
 SUBJECT_LIST="./subj_list.txt"
 DATASET_INFO="./dataset_info.json"
@@ -14,10 +15,11 @@ echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`"
 SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST`
 echo "Subject ID: $SUBJECT_ID"
 
-source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
-conda activate pydfc
-python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/dFC_assessment.py" \
+# Activate  virtual environment
+source "/home/mt00/venvs/pydfc/bin/activate"
+
+python "/home/mt00/pydfc/dFC/task_dFC/dFC_assessment.py" \
 --dataset_info $DATASET_INFO \
 --participant_id $SUBJECT_ID
 
-conda deactivate
+deactivate
diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index 1e90631..ed9306f 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -5,10 +5,9 @@
 #SBATCH --error=logs/fmriprep_err.log   # Standard error log
 #SBATCH --time=24:00:00                # Walltime (24 hours)
 #SBATCH --mem=32G                      # Memory (32 GB)
-#SBATCH --cpus-per-task=1              # Number of CPU cores per task
-#SBATCH --account=rrg-jbpoline           # Account
+#SBATCH --account=def-jbpoline           # Account
 
-source "/home/mt00/projects/rrg-jbpoline/mt00/venvs/nipoppy_env/bin/activate"
+source "/home/mt00/venvs/nipoppy_env/bin/activate"
 
 SUBJECT_LIST="./subj_list.txt"
 
diff --git a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
index 1fff1da..0462e86 100644
--- a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
+++ b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
@@ -1,10 +1,11 @@
 #!/bin/sh
 #
-#$ -cwd
-#$ -o logs/roi_out.txt
-#$ -e logs/roi_err.txt
-#$ -l h_vmem=32G
-#$ -q origami.q
+#SBATCH --job-name=extract_roi_job   # Optional: Name of your job
+#SBATCH --output=logs/roi_out.txt  # Standard output log
+#SBATCH --error=logs/roi_err.txt   # Standard error log
+#SBATCH --account=def-jbpoline           # Account
+#SBATCH --time=24:00:00                # Walltime for each task (24 hours)
+#SBATCH --mem=32G                     # Memory request per node
 
 SUBJECT_LIST="./subj_list.txt"
 DATASET_INFO="./dataset_info.json"
@@ -14,10 +15,11 @@ echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`"
 SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST`
 echo "Subject ID: $SUBJECT_ID"
 
-source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
-conda activate pydfc
-python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/nifti_to_roi_signal.py" \
+# Activate  virtual environment
+source "/home/mt00/venvs/pydfc/bin/activate"
+
+python "/home/mt00/pydfc/dFC/task_dFC/nifti_to_roi_signal.py" \
 --dataset_info $DATASET_INFO \
 --participant_id $SUBJECT_ID
 
-conda deactivate
+deactivate
diff --git a/task_dFC/run_scripts_slurm/run_report.sh b/task_dFC/run_scripts_slurm/run_report.sh
index 2a00cc5..11167d8 100644
--- a/task_dFC/run_scripts_slurm/run_report.sh
+++ b/task_dFC/run_scripts_slurm/run_report.sh
@@ -1,18 +1,20 @@
 #!/bin/sh
 #
-#$ -cwd
-#$ -o logs/report_out.txt
-#$ -e logs/report_err.txt
-#$ -l h_vmem=16G
-#$ -q origami.q
+#SBATCH --job-name=report_job   # Optional: Name of your job
+#SBATCH --output=logs/report_out.txt  # Standard output log
+#SBATCH --error=logs/report_err.txt   # Standard error log
+#SBATCH --account=def-jbpoline           # Account
+#SBATCH --time=24:00:00                # Walltime for each task (24 hours)
+#SBATCH --mem=16G                     # Memory request per node
 
 DATASET_INFO="./dataset_info.json"
 SUBJ_LIST="./subj_list.txt"
 
-source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
-conda activate pydfc
-python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/generate_report.py" \
+# Activate  virtual environment
+source "/home/mt00/venvs/pydfc/bin/activate"
+
+python "/home/mt00/pydfc/dFC/task_dFC/generate_report.py" \
 --dataset_info $DATASET_INFO \
 --subj_list $SUBJ_LIST
 
-conda deactivate
+deactivate

From f0326c3d5d88286a6e9d6b2218d0b0a3261fc27b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 20 Sep 2024 13:22:07 -0400
Subject: [PATCH 122/401] add SLURM_ARRAY_TASK_ID fallback to task_data_simulator

---
 simul_dFC/task_data_simulator.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index ba3b6c5..3dd8315 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -39,8 +39,10 @@
 # create a subject id list
 subj_list = [f"sub-{i:04d}" for i in range(1, n_subj + 1)]
 
-job_id = int(os.getenv("SGE_TASK_ID"))
-subj_id = subj_list[job_id - 1]  # SGE_TASK_ID starts from 1 not 0
+job_id = int(os.getenv("SGE_TASK_ID"))  # for SGE
+if job_id is None:
+    job_id = int(os.getenv("SLURM_ARRAY_TASK_ID"))  # for SLURM
+subj_id = subj_list[job_id - 1]  # TASK_ID starts from 1 not 0
 
 print(f"subject-level simulation started running ... for subject: {subj_id} ...")
 

From 98caf6942789f3c902d43fa53f9b7df36430f4e2 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 20 Sep 2024 13:26:55 -0400
Subject: [PATCH 123/401] add SLURM_ARRAY_TASK_ID

---
 task_dFC/FCS_estimate.py | 6 ++++--
 task_dFC/ML.py           | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index 064988c..0fc67ae 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -120,8 +120,10 @@ def run_FCS_estimate(
 
     TASKS = dataset_info["TASKS"]
 
-    job_id = int(os.getenv("SGE_TASK_ID"))
-    TASK_id = job_id - 1  # SGE_TASK_ID starts from 1 not 0
+    job_id = int(os.getenv("SGE_TASK_ID"))  # for SGE
+    if job_id is None:
+        job_id = int(os.getenv("SLURM_ARRAY_TASK_ID"))  # for SLURM
+    TASK_id = job_id - 1  # TASK_ID starts from 1 not 0
     if TASK_id >= len(TASKS):
         print("TASK_id out of TASKS")
         exit()
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 9a473dd..4674f59 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -279,8 +279,10 @@ def run_task_paradigm_clustering(
         traceback.print_exc()
     print("Task features extraction finished.")
 
-    job_id = int(os.getenv("SGE_TASK_ID"))
-    dFC_id = job_id - 1  # SGE_TASK_ID starts from 1 not 0
+    job_id = int(os.getenv("SGE_TASK_ID"))  # for SGE
+    if job_id is None:
+        job_id = int(os.getenv("SLURM_ARRAY_TASK_ID"))  # for SLURM
+    dFC_id = job_id - 1  # TASK_ID starts from 1 not 0
 
     print(f"Task presence classification started for dFC ID {dFC_id}...")
     try:

From fa0632cfd6314d752f2e3083ef96488ae4da2b4a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 20 Sep 2024 13:46:33 -0400
Subject: [PATCH 124/401] fix job_id parsing: convert to int after SGE/SLURM fallback

---
 simul_dFC/task_data_simulator.py | 5 +++--
 task_dFC/FCS_estimate.py         | 5 +++--
 task_dFC/ML.py                   | 5 +++--
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index 3dd8315..5b081da 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -39,9 +39,10 @@
 # create a subject id list
 subj_list = [f"sub-{i:04d}" for i in range(1, n_subj + 1)]
 
-job_id = int(os.getenv("SGE_TASK_ID"))  # for SGE
+job_id = os.getenv("SGE_TASK_ID")  # for SGE
 if job_id is None:
-    job_id = int(os.getenv("SLURM_ARRAY_TASK_ID"))  # for SLURM
+    job_id = os.getenv("SLURM_ARRAY_TASK_ID")  # for SLURM
+job_id = int(job_id)
 subj_id = subj_list[job_id - 1]  # TASK_ID starts from 1 not 0
 
 print(f"subject-level simulation started running ... for subject: {subj_id} ...")
diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index 0fc67ae..e54ef11 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -120,9 +120,10 @@ def run_FCS_estimate(
 
     TASKS = dataset_info["TASKS"]
 
-    job_id = int(os.getenv("SGE_TASK_ID"))  # for SGE
+    job_id = os.getenv("SGE_TASK_ID")  # for SGE
     if job_id is None:
-        job_id = int(os.getenv("SLURM_ARRAY_TASK_ID"))  # for SLURM
+        job_id = os.getenv("SLURM_ARRAY_TASK_ID")  # for SLURM
+    job_id = int(job_id)
     TASK_id = job_id - 1  # TASK_ID starts from 1 not 0
     if TASK_id >= len(TASKS):
         print("TASK_id out of TASKS")
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 4674f59..f05b4a4 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -279,9 +279,10 @@ def run_task_paradigm_clustering(
         traceback.print_exc()
     print("Task features extraction finished.")
 
-    job_id = int(os.getenv("SGE_TASK_ID"))  # for SGE
+    job_id = os.getenv("SGE_TASK_ID")  # for SGE
     if job_id is None:
-        job_id = int(os.getenv("SLURM_ARRAY_TASK_ID"))  # for SLURM
+        job_id = os.getenv("SLURM_ARRAY_TASK_ID")  # for SLURM
+    job_id = int(job_id)
     dFC_id = job_id - 1  # TASK_ID starts from 1 not 0
 
     print(f"Task presence classification started for dFC ID {dFC_id}...")

From 8e47e2b0b9f88158bbb1058d1bb5685db42e7a1f Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 20 Sep 2024 15:28:50 -0400
Subject: [PATCH 125/401] add dataset info to simulator

---
 simul_dFC/run_scripts_sge/run_simulator.sh   |  7 +++-
 simul_dFC/run_scripts_slurm/run_simulator.sh |  5 ++-
 simul_dFC/task_data_simulator.py             | 34 ++++++++++++++++----
 task_dFC/nifti_to_roi_signal.py              |  1 -
 4 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/simul_dFC/run_scripts_sge/run_simulator.sh b/simul_dFC/run_scripts_sge/run_simulator.sh
index e7f6394..6176236 100644
--- a/simul_dFC/run_scripts_sge/run_simulator.sh
+++ b/simul_dFC/run_scripts_sge/run_simulator.sh
@@ -8,7 +8,12 @@
 #$ -l h_vmem=8G
 #$ -t 1:200
 
+DATASET_INFO="./dataset_info.json"
+
 source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
 conda activate pydfc
-python "/data/origami/dFC/CODEs/pydfc/dFC/simul_dFC/task_data_simulator.py"
+
+python "/data/origami/dFC/CODEs/pydfc/dFC/simul_dFC/task_data_simulator.py" \
+--dataset_info $DATASET_INFO
+
 conda deactivate
diff --git a/simul_dFC/run_scripts_slurm/run_simulator.sh b/simul_dFC/run_scripts_slurm/run_simulator.sh
index f7f8998..21e669e 100644
--- a/simul_dFC/run_scripts_slurm/run_simulator.sh
+++ b/simul_dFC/run_scripts_slurm/run_simulator.sh
@@ -8,11 +8,14 @@
 #SBATCH --mem=8G                     # Memory request per node
 #SBATCH --array=1-200                # Task array specification
 
+DATASET_INFO="./dataset_info.json"
+
 # Activate  virtual environment
 source "/home/mt00/venvs/pydfc/bin/activate"
 
 # Run Python script
-python "/home/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py"
+python "/home/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py" \
+--dataset_info $DATASET_INFO
 
 # Deactivate environment
 deactivate
diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index 5b081da..15d43d7 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -4,6 +4,8 @@
 
 @author: mte
 """
+import argparse
+import json
 import os
 import warnings
 
@@ -19,12 +21,6 @@
 os.environ["OMP_NUM_THREADS"] = "16"
 ################################# Parameters ####################################
 
-# data paths
-dataset = "ds000002"
-# main_root = f"./DATA/{dataset}" # for local
-main_root = f"/data/origami/dFC/DATA/task-based/simulated/{dataset}"  # for server
-output_root = f"{main_root}/derivatives/ROI_timeseries"
-
 # simulation parameters
 sim_length = 250e3  # in m sec
 onset_time = 20.0  # in seconds
@@ -36,6 +32,32 @@
 dt = 0.5  # integration step
 n_subj = 200  # number of subjects
 
+# argparse
+HELPTEXT = """
+Script to simulate task-based data.
+"""
+parser = argparse.ArgumentParser(description=HELPTEXT)
+
+parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
+
+args = parser.parse_args()
+
+dataset_info_file = args.dataset_info
+
+# Read dataset info
+with open(dataset_info_file, "r") as f:
+    dataset_info = json.load(f)
+
+if "{dataset}" in dataset_info["main_root"]:
+    main_root = dataset_info["main_root"].replace("{dataset}", dataset_info["dataset"])
+else:
+    main_root = dataset_info["main_root"]
+
+if "{main_root}" in dataset_info["roi_root"]:
+    output_root = dataset_info["roi_root"].replace("{main_root}", main_root)
+else:
+    output_root = dataset_info["roi_root"]
+
 # create a subject id list
 subj_list = [f"sub-{i:04d}" for i in range(1, n_subj + 1)]
 
diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 56880e4..e8d7aa1 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -2,7 +2,6 @@
 import json
 import os
 import warnings
-from re import A
 
 import numpy as np
 

From 5c2ecc74013936a781532e3462cb5e7ffffa6661 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 23 Sep 2024 23:40:53 -0400
Subject: [PATCH 126/401] use SLURM_ARRAY_TASK_ID in slurm run_dFC and run_nifti_to_roi scripts

---
 task_dFC/run_scripts_slurm/run_dFC.sh          | 2 +-
 task_dFC/run_scripts_slurm/run_nifti_to_roi.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/run_dFC.sh b/task_dFC/run_scripts_slurm/run_dFC.sh
index 84edbb9..8c33edd 100644
--- a/task_dFC/run_scripts_slurm/run_dFC.sh
+++ b/task_dFC/run_scripts_slurm/run_dFC.sh
@@ -12,7 +12,7 @@ DATASET_INFO="./dataset_info.json"
 
 echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`"
 
-SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST`
+SUBJECT_ID=`sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST`
 echo "Subject ID: $SUBJECT_ID"
 
 # Activate  virtual environment
diff --git a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
index 0462e86..419efc7 100644
--- a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
+++ b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
@@ -12,7 +12,7 @@ DATASET_INFO="./dataset_info.json"
 
 echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`"
 
-SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST`
+SUBJECT_ID=`sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST`
 echo "Subject ID: $SUBJECT_ID"
 
 # Activate  virtual environment

From 31e845a46401844838dc845b9307bb94e9344419 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 24 Sep 2024 14:16:59 -0400
Subject: [PATCH 127/401] dfc embed error handle

---
 pydfc/ml_utils.py | 133 +++++++++++++++++++++++++---------------------
 1 file changed, 72 insertions(+), 61 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index b3d9f9f..32ded52 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -457,7 +457,7 @@ def generalized_procrustes(X_list):
         except:
             continue
 
-    raise ValueError("Generalized Procrustes Analysis did not converge.")
+    raise RuntimeError("Generalized Procrustes Analysis did not converge.")
 
 
 def twonn(X, discard_ratio=0.1):
@@ -523,20 +523,23 @@ def SI_ID(X, y, search_range=range(2, 50, 5), n_neighbors_LE=125):
 
     SI_score = {}
     for n_components in search_range:
-        X_train_embed, _ = embed_dFC_features(
-            train_subjects=["subj"],
-            test_subjects=[],
-            X_train=X,
-            X_test=None,
-            y_train=y,
-            y_test=None,
-            subj_label_train=np.array(["subj"] * len(y)),
-            subj_label_test=None,
-            embedding="LE",
-            n_components=n_components,
-            n_neighbors_LE=n_neighbors_LE,
-            LE_embedding_method="embed+procrustes",
-        )
+        try:
+            X_train_embed, _ = embed_dFC_features(
+                train_subjects=["subj"],
+                test_subjects=[],
+                X_train=X,
+                X_test=None,
+                y_train=y,
+                y_test=None,
+                subj_label_train=np.array(["subj"] * len(y)),
+                subj_label_test=None,
+                embedding="LE",
+                n_components=n_components,
+                n_neighbors_LE=n_neighbors_LE,
+                LE_embedding_method="embed+procrustes",
+            )
+        except:
+            continue
 
         SI_score[n_components] = silhouette_score(X_train_embed, y)
 
@@ -1082,7 +1085,7 @@ def task_presence_classification(
         )
     )
 
-    ML_RESULT = {}
+    ML_RESULT = {"PCA": {}, "LE": {}}
     ML_scores = {
         "subj_id": list(),
         "group": list(),
@@ -1097,20 +1100,23 @@ def task_presence_classification(
     }
     for embedding in ["PCA", "LE"]:
         # embed dFC features
-        X_train_embedded, X_test_embedded = embed_dFC_features(
-            train_subjects=train_subjects,
-            test_subjects=test_subjects,
-            X_train=X_train,
-            X_test=X_test,
-            y_train=y_train,
-            y_test=y_test,
-            subj_label_train=subj_label_train,
-            subj_label_test=subj_label_test,
-            embedding=embedding,
-            n_components="auto",
-            n_neighbors_LE=125,
-            LE_embedding_method="embed+procrustes",
-        )
+        try:
+            X_train_embedded, X_test_embedded = embed_dFC_features(
+                train_subjects=train_subjects,
+                test_subjects=test_subjects,
+                X_train=X_train,
+                X_test=X_test,
+                y_train=y_train,
+                y_test=y_test,
+                subj_label_train=subj_label_train,
+                subj_label_test=subj_label_test,
+                embedding=embedding,
+                n_components="auto",
+                n_neighbors_LE=125,
+                LE_embedding_method="embed+procrustes",
+            )
+        except:
+            continue
 
         # task presence classification
 
@@ -1134,7 +1140,6 @@ def task_presence_classification(
         #     X_train_embedded, y_train, X_test_embedded, y_test
         # )
 
-        ML_RESULT[embedding] = {}
         for key in log_reg_RESULT:
             ML_RESULT[embedding][key] = log_reg_RESULT[key]
         for key in KNN_RESULT:
@@ -1224,7 +1229,7 @@ def task_presence_clustering(
         normalize_dFC=normalize_dFC,
     )
 
-    clustering_RESULTS = {}
+    clustering_RESULTS = {"PCA": {}, "LE": {}}
     clustering_scores = {
         "subj_id": list(),
         "task": list(),
@@ -1236,20 +1241,23 @@ def task_presence_clustering(
     }
     for embedding in ["PCA", "LE"]:
         # embed dFC features
-        X_embedded, _ = embed_dFC_features(
-            train_subjects=SUBJECTS,
-            test_subjects=[],
-            X_train=X,
-            X_test=None,
-            y_train=y,
-            y_test=None,
-            subj_label_train=subj_label,
-            subj_label_test=None,
-            embedding=embedding,
-            n_components="auto",
-            n_neighbors_LE=125,
-            LE_embedding_method="embed+procrustes",
-        )
+        try:
+            X_embedded, _ = embed_dFC_features(
+                train_subjects=SUBJECTS,
+                test_subjects=[],
+                X_train=X,
+                X_test=None,
+                y_train=y,
+                y_test=None,
+                subj_label_train=subj_label,
+                subj_label_test=None,
+                embedding=embedding,
+                n_components="auto",
+                n_neighbors_LE=125,
+                LE_embedding_method="embed+procrustes",
+            )
+        except:
+            continue
 
         # clustering
         # apply kmeans clustering to dFC features
@@ -1369,23 +1377,26 @@ def task_paradigm_clustering(
     y = y[idx]
     subj_label = subj_label[idx]
 
-    task_paradigm_clstr_RESULTS = {}
+    task_paradigm_clstr_RESULTS = {"PCA": {}, "LE": {}}
     for embedding in ["PCA", "LE"]:
         # embed dFC features
-        X_embed, _ = embed_dFC_features(
-            train_subjects=SUBJECTS,
-            test_subjects=[],
-            X_train=X,
-            X_test=None,
-            y_train=y,
-            y_test=None,
-            subj_label_train=subj_label,
-            subj_label_test=None,
-            embedding=embedding,
-            n_components="auto",
-            n_neighbors_LE=125,
-            LE_embedding_method="embed+procrustes",
-        )
+        try:
+            X_embed, _ = embed_dFC_features(
+                train_subjects=SUBJECTS,
+                test_subjects=[],
+                X_train=X,
+                X_test=None,
+                y_train=y,
+                y_test=None,
+                subj_label_train=subj_label,
+                subj_label_test=None,
+                embedding=embedding,
+                n_components="auto",
+                n_neighbors_LE=125,
+                LE_embedding_method="embed+procrustes",
+            )
+        except:
+            continue
 
         # clustering
         # apply kmeans clustering to dFC features

From 427c3c694ccb9bfdfeaf28ab49dd29a8f47789b1 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 24 Sep 2024 16:44:05 -0400
Subject: [PATCH 128/401] minor change

---
 task_dFC/ML.py | 99 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 59 insertions(+), 40 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index f05b4a4..c965319 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -79,22 +79,28 @@ def run_classification(
         for task_id, task in enumerate(TASKS):
             ML_RESULT[task] = {}
             for run in RUNS[task]:
-                ML_RESULT_new, ML_scores_new = task_presence_classification(
-                    task=task,
-                    dFC_id=dFC_id,
-                    roi_root=roi_root,
-                    dFC_root=dFC_root,
-                    run=run,
-                    session=session,
-                    dynamic_pred=dynamic_pred,
-                    normalize_dFC=normalize_dFC,
-                )
-                if run is None:
-                    ML_RESULT[task] = ML_RESULT_new
-                else:
-                    ML_RESULT[task][run] = ML_RESULT_new
-                for key in ML_scores:
-                    ML_scores[key].extend(ML_scores_new[key])
+                try:
+                    ML_RESULT_new, ML_scores_new = task_presence_classification(
+                        task=task,
+                        dFC_id=dFC_id,
+                        roi_root=roi_root,
+                        dFC_root=dFC_root,
+                        run=run,
+                        session=session,
+                        dynamic_pred=dynamic_pred,
+                        normalize_dFC=normalize_dFC,
+                    )
+                    if run is None:
+                        ML_RESULT[task] = ML_RESULT_new
+                    else:
+                        ML_RESULT[task][run] = ML_RESULT_new
+                    for key in ML_scores:
+                        ML_scores[key].extend(ML_scores_new[key])
+                except Exception as e:
+                    print(
+                        f"Error in task presence classification for {session} {task} {run}: {e}"
+                    )
+                    traceback.print_exc()
 
         if session is None:
             folder = f"{output_root}"
@@ -137,21 +143,29 @@ def run_clustering(
         for task_id, task in enumerate(TASKS):
             clustering_RESULTS[task] = {}
             for run in RUNS[task]:
-                clustering_RESULTS_new, clustering_scores_new = task_presence_clustering(
-                    task=task,
-                    dFC_id=dFC_id,
-                    roi_root=roi_root,
-                    dFC_root=dFC_root,
-                    run=run,
-                    session=session,
-                    normalize_dFC=normalize_dFC,
-                )
-                if run is None:
-                    clustering_RESULTS[task] = clustering_RESULTS_new
-                else:
-                    clustering_RESULTS[task][run] = clustering_RESULTS_new
-                for key in clustering_scores:
-                    clustering_scores[key].extend(clustering_scores_new[key])
+                try:
+                    clustering_RESULTS_new, clustering_scores_new = (
+                        task_presence_clustering(
+                            task=task,
+                            dFC_id=dFC_id,
+                            roi_root=roi_root,
+                            dFC_root=dFC_root,
+                            run=run,
+                            session=session,
+                            normalize_dFC=normalize_dFC,
+                        )
+                    )
+                    if run is None:
+                        clustering_RESULTS[task] = clustering_RESULTS_new
+                    else:
+                        clustering_RESULTS[task][run] = clustering_RESULTS_new
+                    for key in clustering_scores:
+                        clustering_scores[key].extend(clustering_scores_new[key])
+                except Exception as e:
+                    print(
+                        f"Error in task presence clustering for {session} {task} {run}: {e}"
+                    )
+                    traceback.print_exc()
 
         if session is None:
             folder = f"{output_root}"
@@ -179,15 +193,20 @@ def run_task_paradigm_clustering(
 ):
     for session in SESSIONS:
 
-        task_paradigm_clstr_RESULTS = task_paradigm_clustering(
-            dFC_id=dFC_id,
-            TASKS=TASKS,
-            RUNS=RUNS,
-            session=session,
-            roi_root=roi_root,
-            dFC_root=dFC_root,
-            normalize_dFC=normalize_dFC,
-        )
+        try:
+            task_paradigm_clstr_RESULTS = task_paradigm_clustering(
+                dFC_id=dFC_id,
+                TASKS=TASKS,
+                RUNS=RUNS,
+                session=session,
+                roi_root=roi_root,
+                dFC_root=dFC_root,
+                normalize_dFC=normalize_dFC,
+            )
+        except Exception as e:
+            print(f"Error in task paradigm clustering for {session}: {e}")
+            traceback.print_exc()
+            continue
 
         if session is None:
             folder = f"{output_root}"

From 943cd7be6aed1ce42fcaa6451303c9e32a2fc4c3 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 3 Oct 2024 10:24:46 -0400
Subject: [PATCH 129/401] slurm change

---
 task_dFC/run_scripts_slurm/global_config.json | 4 ++--
 task_dFC/run_scripts_slurm/run_ML.sh          | 2 +-
 task_dFC/run_scripts_slurm/run_fmriprep.sh    | 6 +++++-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/global_config.json b/task_dFC/run_scripts_slurm/global_config.json
index 0e0681c..a99d2d7 100644
--- a/task_dFC/run_scripts_slurm/global_config.json
+++ b/task_dFC/run_scripts_slurm/global_config.json
@@ -3,10 +3,10 @@
     "VISIT_IDS": [],
     "SESSION_IDS": [],
     "SUBSTITUTIONS": {
-        "[[NIPOPPY_DPATH_CONTAINERS]]": "/home/mt00/projects/def-jbpoline/container_store/nipoppy/",
+        "[[NIPOPPY_DPATH_CONTAINERS]]": "/home/mt00/projects/def-jbpoline/container_store/nipoppy",
         "[[HEUDICONV_HEURISTIC_FILE]]": "",
         "[[DCM2BIDS_CONFIG_FILE]]": "",
-        "[[FREESURFER_LICENSE_FILE]]": "/home/mt00/projects/def-jbpoline/mt00/freesurfer/",
+        "[[FREESURFER_LICENSE_FILE]]": "/home/mt00/projects/def-jbpoline/mt00/freesurfer/license.txt",
         "[[TEMPLATEFLOW_HOME]]": "/home/mt00/projects/def-jbpoline/templateflow"
     },
     "DICOM_DIR_PARTICIPANT_FIRST": true,
diff --git a/task_dFC/run_scripts_slurm/run_ML.sh b/task_dFC/run_scripts_slurm/run_ML.sh
index ff4a52b..4b166fd 100644
--- a/task_dFC/run_scripts_slurm/run_ML.sh
+++ b/task_dFC/run_scripts_slurm/run_ML.sh
@@ -5,7 +5,7 @@
 #SBATCH --error=logs/ML_err.txt   # Standard error log
 #SBATCH --account=def-jbpoline           # Account
 #SBATCH --time=72:00:00                # Walltime for each task (72 hours)
-#SBATCH --mem=64G                     # Memory request per node
+#SBATCH --mem=70G                     # Memory request per node
 
 DATASET_INFO="./dataset_info.json"
 
diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index ed9306f..3bfd8a3 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -4,8 +4,12 @@
 #SBATCH --output=logs/fmriprep_out.log  # Standard output log
 #SBATCH --error=logs/fmriprep_err.log   # Standard error log
 #SBATCH --time=24:00:00                # Walltime (24 hours)
-#SBATCH --mem=32G                      # Memory (32 GB)
+#SBATCH --mem=64G                      # Memory (64 GB)
+#SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
 #SBATCH --account=def-jbpoline           # Account
+#SBATCH --tmp=100G                     # Allocate 100GB of temporary space
+
+module load apptainer
 
 source "/home/mt00/venvs/nipoppy_env/bin/activate"
 

From 8292a676ba3739b2f00150910c2a47857b456bd3 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 11 Oct 2024 13:06:09 -0400
Subject: [PATCH 130/401] minor change

---
 task_dFC/run_scripts_slurm/run_fmriprep.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index 3bfd8a3..5e60266 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -3,7 +3,7 @@
 #SBATCH --job-name=fmriprep_job       # Name of the job
 #SBATCH --output=logs/fmriprep_out.log  # Standard output log
 #SBATCH --error=logs/fmriprep_err.log   # Standard error log
-#SBATCH --time=24:00:00                # Walltime (24 hours)
+#SBATCH --time=72:00:00                # Walltime (72 hours)
 #SBATCH --mem=64G                      # Memory (64 GB)
 #SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
 #SBATCH --account=def-jbpoline           # Account

From b5ea93d867d2ae21cea915b26785adc70bc66c2a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 17 Oct 2024 01:00:51 -0400
Subject: [PATCH 131/401] minor change

---
 task_dFC/run_scripts_slurm/run_fmriprep.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index 5e60266..e78a88c 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -3,7 +3,7 @@
 #SBATCH --job-name=fmriprep_job       # Name of the job
 #SBATCH --output=logs/fmriprep_out.log  # Standard output log
 #SBATCH --error=logs/fmriprep_err.log   # Standard error log
-#SBATCH --time=72:00:00                # Walltime (72 hours)
+#SBATCH --time=10-00:00:00                # Walltime (10 days)
 #SBATCH --mem=64G                      # Memory (64 GB)
 #SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
 #SBATCH --account=def-jbpoline           # Account

From e21701ed85d6d994d9dce307f506ac7cce907a71 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 17 Oct 2024 01:02:37 -0400
Subject: [PATCH 132/401] minor change

---
 task_dFC/run_scripts_slurm/run_fmriprep.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index e78a88c..a2721b7 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -3,7 +3,7 @@
 #SBATCH --job-name=fmriprep_job       # Name of the job
 #SBATCH --output=logs/fmriprep_out.log  # Standard output log
 #SBATCH --error=logs/fmriprep_err.log   # Standard error log
-#SBATCH --time=10-00:00:00                # Walltime (10 days)
+#SBATCH --time=7-00:00:00                # Walltime (7 days)
 #SBATCH --mem=64G                      # Memory (64 GB)
 #SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
 #SBATCH --account=def-jbpoline           # Account

From ca0cbacbcae4cec8b8531cc92831758491081347 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 22 Oct 2024 16:59:44 -0400
Subject: [PATCH 133/401] minor change

---
 task_dFC/run_scripts_slurm/run_FCS.sh          | 2 +-
 task_dFC/run_scripts_slurm/run_ML.sh           | 2 +-
 task_dFC/run_scripts_slurm/run_dFC.sh          | 2 +-
 task_dFC/run_scripts_slurm/run_fmriprep.sh     | 2 +-
 task_dFC/run_scripts_slurm/run_nifti_to_roi.sh | 2 +-
 task_dFC/run_scripts_slurm/run_report.sh       | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh
index b4d3b52..7ef0058 100644
--- a/task_dFC/run_scripts_slurm/run_FCS.sh
+++ b/task_dFC/run_scripts_slurm/run_FCS.sh
@@ -3,7 +3,7 @@
 #SBATCH --job-name=fit_fcs_job   # Optional: Name of your job
 #SBATCH --output=logs/fcs_out.txt  # Standard output log
 #SBATCH --error=logs/fcs_err.txt   # Standard error log
-#SBATCH --account=def-jbpoline           # Account
+#SBATCH --account=rrg-jbpoline           # Account
 #SBATCH --time=96:00:00                # Walltime for each task (96 hours)
 #SBATCH --mem=64G                     # Memory request per node
 
diff --git a/task_dFC/run_scripts_slurm/run_ML.sh b/task_dFC/run_scripts_slurm/run_ML.sh
index 4b166fd..da8c6cc 100644
--- a/task_dFC/run_scripts_slurm/run_ML.sh
+++ b/task_dFC/run_scripts_slurm/run_ML.sh
@@ -3,7 +3,7 @@
 #SBATCH --job-name=ML_job   # Optional: Name of your job
 #SBATCH --output=logs/ML_out.txt  # Standard output log
 #SBATCH --error=logs/ML_err.txt   # Standard error log
-#SBATCH --account=def-jbpoline           # Account
+#SBATCH --account=rrg-jbpoline           # Account
 #SBATCH --time=72:00:00                # Walltime for each task (72 hours)
 #SBATCH --mem=70G                     # Memory request per node
 
diff --git a/task_dFC/run_scripts_slurm/run_dFC.sh b/task_dFC/run_scripts_slurm/run_dFC.sh
index 8c33edd..e329fd0 100644
--- a/task_dFC/run_scripts_slurm/run_dFC.sh
+++ b/task_dFC/run_scripts_slurm/run_dFC.sh
@@ -3,7 +3,7 @@
 #SBATCH --job-name=assess_dfc_job   # Optional: Name of your job
 #SBATCH --output=logs/dfc_out.txt  # Standard output log
 #SBATCH --error=logs/dfc_err.txt   # Standard error log
-#SBATCH --account=def-jbpoline           # Account
+#SBATCH --account=rrg-jbpoline           # Account
 #SBATCH --time=24:00:00                # Walltime for each task (24 hours)
 #SBATCH --mem=32G                     # Memory request per node
 
diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index a2721b7..7183c19 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -6,7 +6,7 @@
 #SBATCH --time=7-00:00:00                # Walltime (7 days)
 #SBATCH --mem=64G                      # Memory (64 GB)
 #SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
-#SBATCH --account=def-jbpoline           # Account
+#SBATCH --account=rrg-jbpoline           # Account
 #SBATCH --tmp=100G                     # Allocate 100GB of temporary space
 
 module load apptainer
diff --git a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
index 419efc7..36ada93 100644
--- a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
+++ b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
@@ -3,7 +3,7 @@
 #SBATCH --job-name=extract_roi_job   # Optional: Name of your job
 #SBATCH --output=logs/roi_out.txt  # Standard output log
 #SBATCH --error=logs/roi_err.txt   # Standard error log
-#SBATCH --account=def-jbpoline           # Account
+#SBATCH --account=rrg-jbpoline           # Account
 #SBATCH --time=24:00:00                # Walltime for each task (24 hours)
 #SBATCH --mem=32G                     # Memory request per node
 
diff --git a/task_dFC/run_scripts_slurm/run_report.sh b/task_dFC/run_scripts_slurm/run_report.sh
index 11167d8..57b6634 100644
--- a/task_dFC/run_scripts_slurm/run_report.sh
+++ b/task_dFC/run_scripts_slurm/run_report.sh
@@ -3,7 +3,7 @@
 #SBATCH --job-name=report_job   # Optional: Name of your job
 #SBATCH --output=logs/report_out.txt  # Standard output log
 #SBATCH --error=logs/report_err.txt   # Standard error log
-#SBATCH --account=def-jbpoline           # Account
+#SBATCH --account=rrg-jbpoline           # Account
 #SBATCH --time=24:00:00                # Walltime for each task (24 hours)
 #SBATCH --mem=16G                     # Memory request per node
 

From 98dd86cf9bb1839a12b576db8ee0fc35489f29f9 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 29 Oct 2024 13:47:09 -0400
Subject: [PATCH 134/401] minor

---
 task_dFC/run_scripts_slurm/run_fmriprep.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index 7183c19..de517f4 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -7,7 +7,6 @@
 #SBATCH --mem=64G                      # Memory (64 GB)
 #SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
 #SBATCH --account=rrg-jbpoline           # Account
-#SBATCH --tmp=100G                     # Allocate 100GB of temporary space
 
 module load apptainer
 

From 5f4858abafdc11229c98d472c0d85d70914be27c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 4 Nov 2024 15:14:42 -0500
Subject: [PATCH 135/401] modify run_fmriprep

---
 task_dFC/run_scripts_slurm/run_fmriprep.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index de517f4..130ca58 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -4,7 +4,7 @@
 #SBATCH --output=logs/fmriprep_out.log  # Standard output log
 #SBATCH --error=logs/fmriprep_err.log   # Standard error log
 #SBATCH --time=7-00:00:00                # Walltime (7 days)
-#SBATCH --mem=64G                      # Memory (64 GB)
+#SBATCH --mem=32G                      # Memory (32 GB)
 #SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
 #SBATCH --account=rrg-jbpoline           # Account
 
@@ -20,8 +20,8 @@ SUBJECT_ID=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST)
 echo "Subject ID: $SUBJECT_ID"
 
 nipoppy run \
+"$(dirname "$(pwd)")" \
 --pipeline fmriprep \
---dataset-root "$(dirname "$(pwd)")" \
 --participant-id $SUBJECT_ID
 
 deactivate

From 1ab15c46e7d9d0d25182011656c2a6a0a20196fb Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 5 Nov 2024 11:55:21 -0500
Subject: [PATCH 136/401] enable lobpcg eigen_solver in LE_transform

---
 pydfc/ml_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 32ded52..c53d8b9 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -620,7 +620,7 @@ def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"):
         n_components=n_components,
         affinity="precomputed",
         n_neighbors=n_neighbors_to_be_used,
-        # eigen_solver="lobpcg",
+        eigen_solver="lobpcg",
     )
     X_embed = LE.fit_transform(X=affinity_matrix)
     return X_embed

From 752aacc35444c2aab2bc34bff8e979ec5541b100 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 11 Nov 2024 14:23:01 -0500
Subject: [PATCH 137/401] change confound strategy to simple

---
 pydfc/data_loader.py            | 12 +++++++++++-
 task_dFC/nifti_to_roi_signal.py |  2 +-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/pydfc/data_loader.py b/pydfc/data_loader.py
index fba1ced..7f65959 100644
--- a/pydfc/data_loader.py
+++ b/pydfc/data_loader.py
@@ -167,9 +167,12 @@ def nifti2array(nifti_file, confound_strategy="none", standardize=False, n_rois=
         'no_motion_no_gsr': motion parameters are used
                             and global signal regression
                             is applied.
+        'simple': nilearn's simple preprocessing with
+                            full motion and basic wm_csf
+                            and high_pass
     """
     from nilearn import datasets
-    from nilearn.interfaces.fmriprep import load_confounds
+    from nilearn.interfaces.fmriprep import load_confounds, load_confounds_strategy
     from nilearn.maskers import NiftiLabelsMasker
     from nilearn.plotting import find_parcellation_cut_coords
 
@@ -223,6 +226,13 @@ def nifti2array(nifti_file, confound_strategy="none", standardize=False, n_rois=
         time_series = masker.fit_transform(
             nifti_file, confounds=confounds_simple, sample_mask=sample_mask
         )
+    elif confound_strategy == "simple":
+        confounds_simple, sample_mask = load_confounds_strategy(
+            nifti_file, denoise_strategy="simple"
+        )
+        time_series = masker.fit_transform(
+            nifti_file, confounds=confounds_simple, sample_mask=sample_mask
+        )
 
     return time_series, labels, locs
 
diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index e8d7aa1..00f010b 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -97,7 +97,7 @@ def run_roi_signal_extraction(
             n_rois=100,
             Fs=1 / TR_mri,
             subj_id=subj,
-            confound_strategy="no_motion",
+            confound_strategy="simple",
             standardize="zscore",
             TS_name="BOLD",
             session=task,

From 3468a54472a4564ec574e1695f98babf2ff53bb7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 11 Nov 2024 17:18:06 -0500
Subject: [PATCH 138/401] change fmriprep root for slurm

---
 task_dFC/nifti_to_roi_signal.py              | 4 ++++
 task_dFC/run_scripts_slurm/dataset_info.json | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 00f010b..5d81b6d 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -241,6 +241,10 @@ def run_roi_signal_extraction(
 
     if "{main_root}" in dataset_info["fmriprep_root"]:
         fmriprep_root = dataset_info["fmriprep_root"].replace("{main_root}", main_root)
+    elif "{dataset}" in dataset_info["fmriprep_root"]:
+        fmriprep_root = dataset_info["fmriprep_root"].replace(
+            "{dataset}", dataset_info["dataset"]
+        )
     else:
         fmriprep_root = dataset_info["fmriprep_root"]
 
diff --git a/task_dFC/run_scripts_slurm/dataset_info.json b/task_dFC/run_scripts_slurm/dataset_info.json
index e466511..bc39b0d 100644
--- a/task_dFC/run_scripts_slurm/dataset_info.json
+++ b/task_dFC/run_scripts_slurm/dataset_info.json
@@ -1,7 +1,7 @@
 {
 	"dataset" : "",
 	"main_root" : "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/{dataset}",
-	"fmriprep_root" : "{main_root}/derivatives/fmriprep/23.1.3/output",
+	"fmriprep_root" : "/home/mt00/scratch/DATA/task-based/openneuro/{dataset}/derivatives/fmriprep/23.1.3/output",
 	"roi_root" : "{main_root}/derivatives/ROI_timeseries",
 	"fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES",
 	"dFC_root" : "{main_root}/derivatives/dFC_assessed",

From 0df68ad830a935bdb9aa36135bd75544ffeea3c2 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 11 Nov 2024 23:47:56 -0500
Subject: [PATCH 139/401] add bids_root

---
 task_dFC/nifti_to_roi_signal.py              | 29 +++++++++++++-------
 task_dFC/run_scripts_sge/dataset_info.json   |  1 +
 task_dFC/run_scripts_slurm/dataset_info.json |  1 +
 3 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 5d81b6d..e4216a4 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -14,7 +14,7 @@
 def run_roi_signal_extraction(
     subj,
     task,
-    main_root,
+    bids_root,
     fmriprep_root,
     bold_suffix,
     output_root,
@@ -59,22 +59,22 @@ def run_roi_signal_extraction(
             task_file = [file_i for file_i in ALL_TASK_FILES if f"_{run}_" in file_i][0]
         if session is None:
             nifti_file = f"{fmriprep_root}/{subj}/func/{task_file}"
-            task_events_root = f"{main_root}/bids/{subj}/func"
+            task_events_root = f"{bids_root}/{subj}/func"
         else:
             nifti_file = f"{fmriprep_root}/{subj}/{session}/func/{task_file}"
-            task_events_root = f"{main_root}/bids/{subj}/{session}/func"
+            task_events_root = f"{bids_root}/{subj}/{session}/func"
         info_file = f"{task_events_root}/{task_file.replace(bold_suffix, '_bold.json')}"
 
-        # in some cases the info file is common for all subjects and can be found in f"{main_root}/bids"
+        # in some cases the info file is common for all subjects and can be found in f"{bids_root}"
         if not os.path.exists(info_file):
-            ALL_COMMON_FILES = os.listdir(f"{main_root}/bids/")
+            ALL_COMMON_FILES = os.listdir(f"{bids_root}/")
             ALL_COMMON_FILES = [
                 file_i
                 for file_i in ALL_COMMON_FILES
                 if (f"{task}_" in file_i) and ("_bold.json" in file_i)
             ]
             if len(ALL_COMMON_FILES) == 1:
-                info_file = f"{main_root}/bids/{ALL_COMMON_FILES[0]}"
+                info_file = f"{bids_root}/{ALL_COMMON_FILES[0]}"
         if not os.path.exists(info_file):
             # if the info file is not found, exclude the subject
             if run is None:
@@ -124,15 +124,15 @@ def run_roi_signal_extraction(
             ]
 
         if not len(ALL_EVENTS_FILES) == 1:
-            # in some cases the event file is common for all subjects and can be found in f"{main_root}/bids"
-            ALL_EVENTS_FILES_COMMON = os.listdir(f"{main_root}/bids/")
+            # in some cases the event file is common for all subjects and can be found in f"{bids_root}"
+            ALL_EVENTS_FILES_COMMON = os.listdir(f"{bids_root}/")
             ALL_EVENTS_FILES_COMMON = [
                 file_i
                 for file_i in ALL_EVENTS_FILES_COMMON
                 if (f"{task}_" in file_i) and ("events.tsv" in file_i)
             ]
             if len(ALL_EVENTS_FILES_COMMON) == 1:
-                events_file = f"{main_root}/bids/{ALL_EVENTS_FILES_COMMON[0]}"
+                events_file = f"{bids_root}/{ALL_EVENTS_FILES_COMMON[0]}"
             else:
                 # if the events file is not found, exclude the subject
                 if run is None:
@@ -239,6 +239,15 @@ def run_roi_signal_extraction(
     else:
         main_root = dataset_info["main_root"]
 
+    if "{main_root}" in dataset_info["bids_root"]:
+        bids_root = dataset_info["bids_root"].replace("{main_root}", main_root)
+    elif "{dataset}" in dataset_info["bids_root"]:
+        bids_root = dataset_info["bids_root"].replace(
+            "{dataset}", dataset_info["dataset"]
+        )
+    else:
+        bids_root = dataset_info["bids_root"]
+
     if "{main_root}" in dataset_info["fmriprep_root"]:
         fmriprep_root = dataset_info["fmriprep_root"].replace("{main_root}", main_root)
     elif "{dataset}" in dataset_info["fmriprep_root"]:
@@ -261,7 +270,7 @@ def run_roi_signal_extraction(
             run_roi_signal_extraction(
                 subj=participant_id,
                 task=task,
-                main_root=main_root,
+                bids_root=bids_root,
                 fmriprep_root=fmriprep_root,
                 bold_suffix=dataset_info["bold_suffix"],
                 output_root=output_root,
diff --git a/task_dFC/run_scripts_sge/dataset_info.json b/task_dFC/run_scripts_sge/dataset_info.json
index 16d775e..30531e6 100644
--- a/task_dFC/run_scripts_sge/dataset_info.json
+++ b/task_dFC/run_scripts_sge/dataset_info.json
@@ -1,6 +1,7 @@
 {
 	"dataset" : "",
 	"main_root" : "/data/origami/dFC/DATA/task-based/openneuro/{dataset}",
+	"bids_root" : "{main_root}/bids",
 	"fmriprep_root" : "{main_root}/derivatives/fmriprep/23.1.3/output",
 	"roi_root" : "{main_root}/derivatives/ROI_timeseries",
 	"fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES",
diff --git a/task_dFC/run_scripts_slurm/dataset_info.json b/task_dFC/run_scripts_slurm/dataset_info.json
index bc39b0d..74f4ddf 100644
--- a/task_dFC/run_scripts_slurm/dataset_info.json
+++ b/task_dFC/run_scripts_slurm/dataset_info.json
@@ -1,6 +1,7 @@
 {
 	"dataset" : "",
 	"main_root" : "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/{dataset}",
+	"bids_root" : "/home/mt00/scratch/DATA/task-based/openneuro/{dataset}/bids",
 	"fmriprep_root" : "/home/mt00/scratch/DATA/task-based/openneuro/{dataset}/derivatives/fmriprep/23.1.3/output",
 	"roi_root" : "{main_root}/derivatives/ROI_timeseries",
 	"fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES",

From 8bd3914b401568907be8825f5fc3578a7be5d578 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 12 Nov 2024 13:39:10 -0500
Subject: [PATCH 140/401] minor change

---
 task_dFC/run_scripts_slurm/run_nifti_to_roi.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
index 36ada93..6d1d88d 100644
--- a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
+++ b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
@@ -5,7 +5,7 @@
 #SBATCH --error=logs/roi_err.txt   # Standard error log
 #SBATCH --account=rrg-jbpoline           # Account
 #SBATCH --time=24:00:00                # Walltime for each task (24 hours)
-#SBATCH --mem=32G                     # Memory request per node
+#SBATCH --mem=64G                     # Memory request per node
 
 SUBJECT_LIST="./subj_list.txt"
 DATASET_INFO="./dataset_info.json"

From 1b99922e31dc857a50d977574e5b1925311e487c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 17 Nov 2024 21:56:15 -0500
Subject: [PATCH 141/401] minor fix

---
 task_dFC/run_scripts_slurm/run_fmriprep.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index 130ca58..8fecaea 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -4,7 +4,7 @@
 #SBATCH --output=logs/fmriprep_out.log  # Standard output log
 #SBATCH --error=logs/fmriprep_err.log   # Standard error log
 #SBATCH --time=7-00:00:00                # Walltime (7 days)
-#SBATCH --mem=32G                      # Memory (32 GB)
+#SBATCH --mem-per-cpu=16G                      # Memory (32 GB)
 #SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
 #SBATCH --account=rrg-jbpoline           # Account
 

From f5188d58a20f7d56280c93e3d024ef99879ad871 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 19 Nov 2024 11:10:06 -0500
Subject: [PATCH 142/401] slurm change

---
 task_dFC/run_scripts_slurm/run_fmriprep.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index 8fecaea..0abfdbc 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -3,8 +3,8 @@
 #SBATCH --job-name=fmriprep_job       # Name of the job
 #SBATCH --output=logs/fmriprep_out.log  # Standard output log
 #SBATCH --error=logs/fmriprep_err.log   # Standard error log
-#SBATCH --time=7-00:00:00                # Walltime (7 days)
-#SBATCH --mem-per-cpu=16G                      # Memory (32 GB)
+#SBATCH --time=2-00:00:00                # Walltime (2 day)
+#SBATCH --mem-per-cpu=16G                # Memory (16 GB) per cpu
 #SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
 #SBATCH --account=rrg-jbpoline           # Account
 

From 809389308793e4f12b318a828214f666eaefa5e8 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 21 Nov 2024 15:12:13 -0500
Subject: [PATCH 143/401] add invo and desc for nipoppy

---
 task_dFC/run_scripts_slurm/descriptor.json    | 668 ++++++++++++++++++
 task_dFC/run_scripts_slurm/global_config.json |  61 +-
 task_dFC/run_scripts_slurm/invocation.json    |  24 +
 3 files changed, 728 insertions(+), 25 deletions(-)
 create mode 100644 task_dFC/run_scripts_slurm/descriptor.json
 create mode 100644 task_dFC/run_scripts_slurm/invocation.json

diff --git a/task_dFC/run_scripts_slurm/descriptor.json b/task_dFC/run_scripts_slurm/descriptor.json
new file mode 100644
index 0000000..f039583
--- /dev/null
+++ b/task_dFC/run_scripts_slurm/descriptor.json
@@ -0,0 +1,668 @@
+{
+    "name": "fmriprep",
+    "description": "fmriprep",
+    "tool-version": "23.1.3",
+    "schema-version": "0.5",
+    "command-line": "[[NIPOPPY_CONTAINER_COMMAND]] --bind $SLURM_TMPDIR:/work [[NIPOPPY_FPATH_CONTAINER]] [BIDS_DIR] [OUTPUT_DIR] [ANALYSIS_LEVEL] [SKIP_BIDS_VALIDATION] [PARTICIPANT_LABEL] [TASK_ID] [ECHO_IDX] [BIDS_FILTERS] [ANAT_DERIVATIVES] [BIDS_DATABASE_DIR] [NPROCS] [OMP_NTHREADS] [MEMORY_GB] [LOW_MEM] [USE_PLUGIN] [SLOPPY] [ANAT_ONLY] [BOILERPLATE_ONLY] [REPORTS_ONLY] [IGNORE] [OUTPUT_SPACES] [LONGITUDINAL] [BOLD2T1W_INIT] [BOLD2T1W_DOF] [USE_BBR] [SLICE_TIME_REF] [DUMMY_SCANS] [_RANDOM_SEED] [ME_T2S_FIT_METHOD] [OUTPUT_LAYOUT] [ME_OUTPUT_ECHOS] [MEDIAL_SURFACE_NAN] [PROJECT_GOODVOXELS] [MD_ONLY_BOILERPLATE] [CIFTI_OUTPUT] [USE_AROMA] [AROMA_MELODIC_DIM] [AROMA_ERR_ON_WARN] [REGRESSORS_ALL_COMPS] [REGRESSORS_FD_TH] [REGRESSORS_DVARS_TH] [SKULL_STRIP_TEMPLATE] [SKULL_STRIP_FIXED_SEED] [SKULL_STRIP_T1W] [FMAP_BSPLINE] [FMAP_NO_DEMEAN] [USE_SYN_SDC] [FORCE_SYN] [FS_LICENSE_FILE] [FS_SUBJECTS_DIR] [HIRES] [SKIP_RECONALL] [TRACK_CARBON] [COUNTRY_CODE] [VERSION] [VERBOSE_COUNT] [WORK_DIR] [CLEAN_WORKDIR] [RESOURCE_MONITOR] [CONFIG_FILE] [WRITE_GRAPH] [STOP_ON_FIRST_CRASH] [NOTRACK] [DEBUG]",
+    "inputs": [
+        {
+            "id": "bids_dir",
+            "name": "bids_dir",
+            "description": "The root folder of a BIDS valid dataset (sub-XXXXX folders should be found at the top level in this folder).",
+            "optional": false,
+            "type": "String",
+            "value-key": "[BIDS_DIR]"
+        },
+        {
+            "id": "output_dir",
+            "name": "output_dir",
+            "description": "The output path for the outcomes of preprocessing and visual reports",
+            "optional": false,
+            "type": "String",
+            "value-key": "[OUTPUT_DIR]"
+        },
+        {
+            "id": "analysis_level",
+            "name": "analysis_level",
+            "description": "Processing stage to be run, only \"participant\" in the case of fMRIPrep (see BIDS-Apps specification).",
+            "optional": false,
+            "type": "String",
+            "value-key": "[ANALYSIS_LEVEL]",
+            "value-choices": [
+                "participant"
+            ]
+        },
+        {
+            "id": "skip_bids_validation",
+            "name": "skip_bids_validation",
+            "description": "Assume the input dataset is BIDS compliant and skip the validation",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[SKIP_BIDS_VALIDATION]",
+            "command-line-flag": "--skip_bids_validation"
+        },
+        {
+            "id": "participant_label",
+            "name": "participant_label",
+            "description": "A space delimited list of participant identifiers or a single identifier (the sub- prefix can be removed)",
+            "optional": true,
+            "type": "String",
+            "value-key": "[PARTICIPANT_LABEL]",
+            "list": true,
+            "command-line-flag": "--participant-label"
+        },
+        {
+            "id": "task_id",
+            "name": "task_id",
+            "description": "Select a specific task to be processed",
+            "optional": true,
+            "type": "String",
+            "value-key": "[TASK_ID]",
+            "command-line-flag": "-t"
+        },
+        {
+            "id": "echo_idx",
+            "name": "echo_idx",
+            "description": "Select a specific echo to be processed in a multiecho series",
+            "optional": true,
+            "type": "Number",
+            "value-key": "[ECHO_IDX]",
+            "command-line-flag": "--echo-idx"
+        },
+        {
+            "id": "bids_filters",
+            "name": "bids_filters",
+            "description": "A JSON file describing custom BIDS input filters using PyBIDS. For further details, please check out https://fmriprep.readthedocs.io/en/0/faq.html#how-do-I-select-only-certain-files-to-be-input-to-fMRIPrep",
+            "optional": true,
+            "type": "String",
+            "value-key": "[BIDS_FILTERS]",
+            "command-line-flag": "--bids-filter-file"
+        },
+        {
+            "id": "anat_derivatives",
+            "name": "anat_derivatives",
+            "description": "Reuse the anatomical derivatives from another fMRIPrep run or calculated with an alternative processing tool (NOT RECOMMENDED).",
+            "optional": true,
+            "type": "String",
+            "value-key": "[ANAT_DERIVATIVES]",
+            "command-line-flag": "--anat-derivatives"
+        },
+        {
+            "id": "bids_database_dir",
+            "name": "bids_database_dir",
+            "description": "Path to a PyBIDS database folder, for faster indexing (especially useful for large datasets). Will be created if not present.",
+            "optional": true,
+            "type": "String",
+            "value-key": "[BIDS_DATABASE_DIR]",
+            "command-line-flag": "--bids-database-dir"
+        },
+        {
+            "id": "nprocs",
+            "name": "nprocs",
+            "description": "Maximum number of threads across all processes",
+            "optional": true,
+            "type": "String",
+            "value-key": "[NPROCS]",
+            "command-line-flag": "--nprocs"
+        },
+        {
+            "id": "omp_nthreads",
+            "name": "omp_nthreads",
+            "description": "Maximum number of threads per-process",
+            "optional": true,
+            "type": "String",
+            "value-key": "[OMP_NTHREADS]",
+            "command-line-flag": "--omp-nthreads"
+        },
+        {
+            "id": "memory_gb",
+            "name": "memory_gb",
+            "description": "Upper bound memory limit for fMRIPrep processes",
+            "optional": true,
+            "type": "String",
+            "value-key": "[MEMORY_GB]",
+            "command-line-flag": "--mem"
+        },
+        {
+            "id": "low_mem",
+            "name": "low_mem",
+            "description": "Attempt to reduce memory usage (will increase disk usage in working directory)",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[LOW_MEM]",
+            "command-line-flag": "--low-mem"
+        },
+        {
+            "id": "use_plugin",
+            "name": "use_plugin",
+            "description": "Nipype plugin configuration file",
+            "optional": true,
+            "type": "String",
+            "value-key": "[USE_PLUGIN]",
+            "command-line-flag": "--use-plugin"
+        },
+        {
+            "id": "sloppy",
+            "name": "sloppy",
+            "description": "Use low-quality tools for speed - TESTING ONLY",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[SLOPPY]",
+            "command-line-flag": "--sloppy"
+        },
+        {
+            "id": "anat_only",
+            "name": "anat_only",
+            "description": "Run anatomical workflows only",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[ANAT_ONLY]",
+            "command-line-flag": "--anat-only"
+        },
+        {
+            "id": "boilerplate_only",
+            "name": "boilerplate_only",
+            "description": "Generate boilerplate only",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[BOILERPLATE_ONLY]",
+            "command-line-flag": "--boilerplate-only"
+        },
+        {
+            "id": "reports_only",
+            "name": "reports_only",
+            "description": "Only generate reports, don't run workflows. This will only rerun report aggregation, not reportlet generation for specific nodes.",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[REPORTS_ONLY]",
+            "command-line-flag": "--reports-only"
+        },
+        {
+            "id": "ignore",
+            "name": "ignore",
+            "description": "Ignore selected aspects of the input dataset to disable corresponding parts of the workflow (a space delimited list)",
+            "optional": true,
+            "type": "String",
+            "value-key": "[IGNORE]",
+            "list": true,
+            "value-choices": [
+                "fieldmaps",
+                "slicetiming",
+                "sbref",
+                "t2w",
+                "flair"
+            ],
+            "command-line-flag": "--ignore"
+        },
+        {
+            "id": "output_spaces",
+            "name": "output_spaces",
+            "description": "Standard and non-standard spaces to resample anatomical and functional images to. Standard spaces may be specified by the form ``<SPACE>[:cohort-<label>][:res-<resolution>][...]``, where ``<SPACE>`` is a keyword designating a spatial reference, and may be followed by optional, colon-separated parameters. Non-standard spaces imply specific orientations and sampling grids. Important to note, the ``res-*`` modifier does not define the resolution used for the spatial normalization. To generate no BOLD outputs, use this option without specifying any spatial references. For further details, please check out https://fmriprep.readthedocs.io/en/0/spaces.html",
+            "optional": true,
+            "list": true,
+            "type": "String",
+            "value-key": "[OUTPUT_SPACES]",
+            "command-line-flag": "--output-spaces"
+        },
+        {
+            "id": "longitudinal",
+            "name": "longitudinal",
+            "description": "Treat dataset as longitudinal - may increase runtime",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[LONGITUDINAL]",
+            "command-line-flag": "--longitudinal"
+        },
+        {
+            "id": "bold2t1w_init",
+            "name": "bold2t1w_init",
+            "description": "Either \"register\" (the default) to initialize volumes at center or \"header\" to use the header information when coregistering BOLD to T1w images.",
+            "optional": true,
+            "type": "String",
+            "value-key": "[BOLD2T1W_INIT]",
+            "default-value": "register",
+            "value-choices": [
+                "register",
+                "header"
+            ],
+            "command-line-flag": "--bold2t1w-init"
+        },
+        {
+            "id": "bold2t1w_dof",
+            "name": "bold2t1w_dof",
+            "description": "Degrees of freedom when registering BOLD to T1w images. 6 degrees (rotation and translation) are used by default.",
+            "optional": true,
+            "type": "Number",
+            "value-key": "[BOLD2T1W_DOF]",
+            "default-value": 6,
+            "value-choices": [
+                6,
+                9,
+                12
+            ],
+            "command-line-flag": "--bold2t1w-dof"
+        },
+        {
+            "id": "use_bbr",
+            "name": "use_bbr",
+            "description": "Always use boundary-based registration (no goodness-of-fit checks)",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[USE_BBR]",
+            "command-line-flag": "--force-bbr"
+        },
+        {
+            "id": "slice_time_ref",
+            "name": "slice_time_ref",
+            "description": "The time of the reference slice to correct BOLD values to, as a fraction of acquisition time. 0 indicates the start, 0.5 the midpoint, and 1 the end of acquisition. The alias `start` corresponds to 0, and `middle` to 0.5. The default value is 0.5.",
+            "optional": true,
+            "type": "String",
+            "value-key": "[SLICE_TIME_REF]",
+            "command-line-flag": "--slice-time-ref"
+        },
+        {
+            "id": "dummy_scans",
+            "name": "dummy_scans",
+            "description": "Number of nonsteady-state volumes. Overrides automatic detection.",
+            "optional": true,
+            "type": "Number",
+            "value-key": "[DUMMY_SCANS]",
+            "command-line-flag": "--dummy-scans"
+        },
+        {
+            "id": "_random_seed",
+            "name": "_random_seed",
+            "description": "Initialize the random seed for the workflow",
+            "optional": true,
+            "type": "Number",
+            "value-key": "[_RANDOM_SEED]",
+            "command-line-flag": "--random-seed"
+        },
+        {
+            "id": "me_t2s_fit_method",
+            "name": "me_t2s_fit_method",
+            "description": "The method by which to estimate T2* and S0 for multi-echo data. 'curvefit' uses nonlinear regression. It is more memory intensive, but also may be more accurate, than 'loglin'. 'loglin' uses log-linear regression. It is faster and less memory intensive, but may be less accurate.",
+            "optional": true,
+            "type": "String",
+            "value-key": "[ME_T2S_FIT_METHOD]",
+            "default-value": "curvefit",
+            "value-choices": [
+                "curvefit",
+                "loglin"
+            ],
+            "command-line-flag": "--me-t2s-fit-method"
+        },
+        {
+            "id": "output_layout",
+            "name": "output_layout",
+            "description": "Organization of outputs. \"bids\" (default) places fMRIPrep derivatives directly in the output directory, and defaults to placing FreeSurfer derivatives in <output-dir>/sourcedata/freesurfer. \"legacy\" creates derivative datasets as subdirectories of outputs.",
+            "optional": true,
+            "type": "String",
+            "value-key": "[OUTPUT_LAYOUT]",
+            "default-value": "bids",
+            "value-choices": [
+                "bids",
+                "legacy"
+            ],
+            "command-line-flag": "--output-layout"
+        },
+        {
+            "id": "me_output_echos",
+            "name": "me_output_echos",
+            "description": "Output individual echo time series with slice, motion and susceptibility correction. Useful for further Tedana processing post-fMRIPrep.",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[ME_OUTPUT_ECHOS]",
+            "command-line-flag": "--me-output-echos"
+        },
+        {
+            "id": "medial_surface_nan",
+            "name": "medial_surface_nan",
+            "description": "Replace medial wall values with NaNs on functional GIFTI files. Only performed for GIFTI files mapped to a freesurfer subject (fsaverage or fsnative).",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[MEDIAL_SURFACE_NAN]",
+            "command-line-flag": "--medial-surface-nan"
+        },
+        {
+            "id": "project_goodvoxels",
+            "name": "project_goodvoxels",
+            "description": "Exclude voxels whose timeseries have locally high coefficient of variation from surface resampling. Only performed for GIFTI files mapped to a freesurfer subject (fsaverage or fsnative).",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[PROJECT_GOODVOXELS]",
+            "command-line-flag": "--project-goodvoxels"
+        },
+        {
+            "id": "md_only_boilerplate",
+            "name": "md_only_boilerplate",
+            "description": "Skip generation of HTML and LaTeX formatted citation with pandoc",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[MD_ONLY_BOILERPLATE]",
+            "command-line-flag": "--md-only-boilerplate"
+        },
+        {
+            "id": "cifti_output",
+            "name": "cifti_output",
+            "description": "Output preprocessed BOLD as a CIFTI dense timeseries. Optionally, the number of grayordinates can be specified (default is 91k, which equates to 2mm resolution).",
+            "optional": true,
+            "type": "String",
+            "value-key": "[CIFTI_OUTPUT]",
+            "value-choices": [
+                "91k",
+                "170k"
+            ],
+            "command-line-flag": "--cifti-output"
+        },
+        {
+            "id": "use_aroma",
+            "name": "use_aroma",
+            "description": "Deprecated. Will raise an error in 24.0.",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[USE_AROMA]",
+            "command-line-flag": "--use-aroma"
+        },
+        {
+            "id": "aroma_melodic_dim",
+            "name": "aroma_melodic_dim",
+            "description": "Deprecated. Will raise an error in 24.0.",
+            "optional": true,
+            "type": "Number",
+            "value-key": "[AROMA_MELODIC_DIM]",
+            "command-line-flag": "--aroma-melodic-dimensionality"
+        },
+        {
+            "id": "aroma_err_on_warn",
+            "name": "aroma_err_on_warn",
+            "description": "Deprecated. Will raise an error in 24.0.",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[AROMA_ERR_ON_WARN]",
+            "command-line-flag": "--error-on-aroma-warnings"
+        },
+        {
+            "id": "regressors_all_comps",
+            "name": "regressors_all_comps",
+            "description": "Include all components estimated in CompCor decomposition in the confounds file instead of only the components sufficient to explain 50 percent of BOLD variance in each CompCor mask",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[REGRESSORS_ALL_COMPS]",
+            "command-line-flag": "--return-all-components"
+        },
+        {
+            "id": "regressors_fd_th",
+            "name": "regressors_fd_th",
+            "description": "Threshold for flagging a frame as an outlier on the basis of framewise displacement",
+            "optional": true,
+            "type": "Number",
+            "value-key": "[REGRESSORS_FD_TH]",
+            "default-value": 0.5,
+            "command-line-flag": "--fd-spike-threshold"
+        },
+        {
+            "id": "regressors_dvars_th",
+            "name": "regressors_dvars_th",
+            "description": "Threshold for flagging a frame as an outlier on the basis of standardised DVARS",
+            "optional": true,
+            "type": "Number",
+            "value-key": "[REGRESSORS_DVARS_TH]",
+            "default-value": 1.5,
+            "command-line-flag": "--dvars-spike-threshold"
+        },
+        {
+            "id": "skull_strip_template",
+            "name": "skull_strip_template",
+            "description": "Select a template for skull-stripping with antsBrainExtraction (OASIS30ANTs, by default)",
+            "optional": true,
+            "type": "String",
+            "value-key": "[SKULL_STRIP_TEMPLATE]",
+            "default-value": "OASIS30ANTs",
+            "command-line-flag": "--skull-strip-template"
+        },
+        {
+            "id": "skull_strip_fixed_seed",
+            "name": "skull_strip_fixed_seed",
+            "description": "Do not use a random seed for skull-stripping - will ensure run-to-run replicability when used with --omp-nthreads 1 and matching --random-seed <int>",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[SKULL_STRIP_FIXED_SEED]",
+            "command-line-flag": "--skull-strip-fixed-seed"
+        },
+        {
+            "id": "skull_strip_t1w",
+            "name": "skull_strip_t1w",
+            "description": "Perform T1-weighted skull stripping ('force' ensures skull stripping, 'skip' ignores skull stripping, and 'auto' applies brain extraction based on the outcome of a heuristic to check whether the brain is already masked).",
+            "optional": true,
+            "type": "String",
+            "value-key": "[SKULL_STRIP_T1W]",
+            "default-value": "force",
+            "value-choices": [
+                "auto",
+                "skip",
+                "force"
+            ],
+            "command-line-flag": "--skull-strip-t1w"
+        },
+        {
+            "id": "fmap_bspline",
+            "name": "fmap_bspline",
+            "description": "Fit a B-Spline field using least-squares (experimental)",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[FMAP_BSPLINE]",
+            "command-line-flag": "--fmap-bspline"
+        },
+        {
+            "id": "fmap_no_demean",
+            "name": "fmap_no_demean",
+            "description": "Do not remove median (within mask) from fieldmap",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[FMAP_NO_DEMEAN]",
+            "command-line-flag": "--fmap-no-demean"
+        },
+        {
+            "id": "use_syn_sdc",
+            "name": "use_syn_sdc",
+            "description": "Use fieldmap-less distortion correction based on anatomical image; if unable, error (default) or warn based on optional argument.",
+            "optional": true,
+            "type": "String",
+            "value-key": "[USE_SYN_SDC]",
+            "value-choices": [
+                "warn",
+                "error"
+            ],
+            "command-line-flag": "--use-syn-sdc"
+        },
+        {
+            "id": "force_syn",
+            "name": "force_syn",
+            "description": "EXPERIMENTAL/TEMPORARY: Use SyN correction in addition to fieldmap correction, if available",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[FORCE_SYN]",
+            "command-line-flag": "--force-syn"
+        },
+        {
+            "id": "fs_license_file",
+            "name": "fs_license_file",
+            "description": "Path to FreeSurfer license key file. Get it (for free) by registering at https://surfer.nmr.mgh.harvard.edu/registration.html",
+            "optional": true,
+            "type": "String",
+            "value-key": "[FS_LICENSE_FILE]",
+            "command-line-flag": "--fs-license-file"
+        },
+        {
+            "id": "fs_subjects_dir",
+            "name": "fs_subjects_dir",
+            "description": "Path to existing FreeSurfer subjects directory to reuse. (default: OUTPUT_DIR/freesurfer)",
+            "optional": true,
+            "type": "String",
+            "value-key": "[FS_SUBJECTS_DIR]",
+            "command-line-flag": "--fs-subjects-dir"
+        },
+        {
+            "id": "hires",
+            "name": "hires",
+            "description": "Disable sub-millimeter (hires) reconstruction",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[HIRES]",
+            "command-line-flag": "--no-submm-recon"
+        },
+        {
+            "id": "skip_reconall",
+            "name": "skip_reconall",
+            "description": "Disable FreeSurfer surface preprocessing.",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[SKIP_RECONALL]",
+            "command-line-flag": "--fs-no-reconall"
+        },
+        {
+            "id": "track_carbon",
+            "name": "track_carbon",
+            "description": "Tracks power draws using CodeCarbon package",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[TRACK_CARBON]",
+            "command-line-flag": "--track-carbon"
+        },
+        {
+            "id": "country_code",
+            "name": "country_code",
+            "description": "Country ISO code used by carbon trackers",
+            "optional": true,
+            "type": "String",
+            "value-key": "[COUNTRY_CODE]",
+            "default-value": "CAN",
+            "command-line-flag": "--country-code"
+        },
+        {
+            "id": "version",
+            "name": "version",
+            "description": "show program's version number and exit",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[VERSION]",
+            "command-line-flag": "--version"
+        },
+        {
+            "id": "verbose_count",
+            "name": "verbose_count",
+            "description": "Increases log verbosity for each occurrence, debug level is -vvv",
+            "optional": true,
+            "type": "String",
+            "value-key": "[VERBOSE_COUNT]",
+            "command-line-flag": "-v",
+            "value-choices": [
+                "-v",
+                "-vv",
+                "-vvv"
+            ]
+        },
+        {
+            "id": "work_dir",
+            "name": "work_dir",
+            "description": "Path where intermediate results should be stored",
+            "optional": true,
+            "type": "String",
+            "value-key": "[WORK_DIR]",
+            "command-line-flag": "-w"
+        },
+        {
+            "id": "clean_workdir",
+            "name": "clean_workdir",
+            "description": "Clears working directory of contents. Use of this flag is not recommended when running concurrent processes of fMRIPrep.",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[CLEAN_WORKDIR]",
+            "command-line-flag": "--clean-workdir"
+        },
+        {
+            "id": "resource_monitor",
+            "name": "resource_monitor",
+            "description": "Enable Nipype's resource monitoring to keep track of memory and CPU usage",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[RESOURCE_MONITOR]",
+            "command-line-flag": "--resource-monitor"
+        },
+        {
+            "id": "config_file",
+            "name": "config_file",
+            "description": "Use pre-generated configuration file. Values in file will be overridden by command-line arguments.",
+            "optional": true,
+            "type": "String",
+            "value-key": "[CONFIG_FILE]",
+            "command-line-flag": "--config-file"
+        },
+        {
+            "id": "write_graph",
+            "name": "write_graph",
+            "description": "Write workflow graph.",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[WRITE_GRAPH]",
+            "command-line-flag": "--write-graph"
+        },
+        {
+            "id": "stop_on_first_crash",
+            "name": "stop_on_first_crash",
+            "description": "Force stopping on first crash, even if a work directory was specified.",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[STOP_ON_FIRST_CRASH]",
+            "command-line-flag": "--stop-on-first-crash"
+        },
+        {
+            "id": "notrack",
+            "name": "notrack",
+            "description": "Opt-out of sending tracking information of this run to the FMRIPREP developers. This information helps to improve FMRIPREP and provides an indicator of real world usage crucial for obtaining funding.",
+            "optional": true,
+            "type": "Flag",
+            "value-key": "[NOTRACK]",
+            "command-line-flag": "--notrack"
+        },
+        {
+            "id": "debug",
+            "name": "debug",
+            "description": "Debug mode(s) to enable. 'all' is alias for all available modes.",
+            "optional": true,
+            "type": "String",
+            "value-key": "[DEBUG]",
+            "list": true,
+            "value-choices": [
+                "compcor",
+                "fieldmaps",
+                "pdb",
+                "all"
+            ],
+            "command-line-flag": "--debug"
+        }
+    ],
+    "tags": {},
+    "suggested-resources": {
+        "cpu-cores": 8,
+        "ram": 8,
+        "walltime-estimate": 480
+    },
+    "custom": {
+        "nipoppy": {
+            "CONTAINER_CONFIG": {
+                "ARGS": [
+                    "--bind",
+                    "[[NIPOPPY_DPATH_DERIVATIVES]]/freesurfer/7.3.2/output/[[NIPOPPY_BIDS_SESSION_ID]]"
+                ]
+            }
+        },
+        "nipoppy_old": {
+            "paths_to_tar": [
+                "[[NIPOPPY_DPATH_PIPELINE_OUTPUT]]/sub-[[NIPOPPY_PARTICIPANT_ID]]/ses-[[NIPOPPY_BIDS_SESSION_ID]]",
+                "[[NIPOPPY_DPATH_DERIVATIVES]]/freesurfer/7.3.2/output/[[NIPOPPY_BIDS_SESSION_ID]]/sub-[[NIPOPPY_PARTICIPANT_ID]]"
+            ]
+        }
+    }
+}
diff --git a/task_dFC/run_scripts_slurm/global_config.json b/task_dFC/run_scripts_slurm/global_config.json
index a99d2d7..d818ab3 100644
--- a/task_dFC/run_scripts_slurm/global_config.json
+++ b/task_dFC/run_scripts_slurm/global_config.json
@@ -1,7 +1,13 @@
 {
     "DATASET_NAME": "<DATASET_NAME>",
-    "VISIT_IDS": [],
-    "SESSION_IDS": [],
+    "VISIT_IDS": [
+        "<VISIT_LABEL>",
+        "<OTHER_VISIT_LABEL>"
+    ],
+    "SESSION_IDS": [
+        "<SESSION_LABEL>",
+        "<OTHER_SESSION_LABEL>"
+    ],
     "SUBSTITUTIONS": {
         "[[NIPOPPY_DPATH_CONTAINERS]]": "/home/mt00/projects/def-jbpoline/container_store/nipoppy",
         "[[HEUDICONV_HEURISTIC_FILE]]": "",
@@ -27,13 +33,13 @@
             "STEPS": [
                 {
                     "NAME": "prepare",
-                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
-                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json"
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor.json"
                 },
                 {
                     "NAME": "convert",
-                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
-                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor.json",
                     "CONTAINER_CONFIG": {
                         "ARGS": [
                             "--bind",
@@ -54,13 +60,13 @@
             "STEPS": [
                 {
                     "NAME": "prepare",
-                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
-                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/dcm2bids_helper-[[PIPELINE_VERSION]].json"
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor-dcm2bids_helper.json"
                 },
                 {
                     "NAME": "convert",
-                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
-                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/dcm2bids-[[PIPELINE_VERSION]].json",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor-dcm2bids.json",
                     "CONTAINER_CONFIG": {
                         "ARGS": [
                             "--bind",
@@ -77,20 +83,20 @@
             "STEPS": [
                 {
                     "NAME": "prepare",
-                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
-                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/bidsmapper-[[PIPELINE_VERSION]].json",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor-bidsmapper.json",
                     "ANALYSIS_LEVEL": "group"
                 },
                 {
                     "NAME": "edit",
-                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
-                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/bidseditor-[[PIPELINE_VERSION]].json",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor-bidseditor.json",
                     "ANALYSIS_LEVEL": "group"
                 },
                 {
                     "NAME": "convert",
-                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]-[[STEP_NAME]].json",
-                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/bidscoiner-[[PIPELINE_VERSION]].json",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor-bidscoiner.json",
                     "ANALYSIS_LEVEL": "participant",
                     "UPDATE_DOUGHNUT": true
                 }
@@ -118,17 +124,22 @@
             },
             "STEPS": [
                 {
-                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json",
-                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json"
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation.json",
+                    "GENERATE_PYBIDS_DATABASE": false,
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor.json",
+                    "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/tracker_config.json"
                 }
-            ],
-            "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_TRACKER_CONFIGS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json"
+            ]
         },
         {
             "NAME": "freesurfer",
             "VERSION": "7.3.2",
             "DESCRIPTION": "Freesurfer version associated with fMRIPrep 23.1.3",
-            "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_TRACKER_CONFIGS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json"
+            "STEPS": [
+                {
+                    "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/tracker_config.json"
+                }
+            ]
         },
         {
             "NAME": "mriqc",
@@ -148,11 +159,11 @@
             },
             "STEPS": [
                 {
-                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_INVOCATIONS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json",
-                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_DESCRIPTORS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json"
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation.json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor.json",
+                    "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/tracker_config.json"
                 }
-            ],
-            "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_TRACKER_CONFIGS]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]].json"
+            ]
         }
     ],
     "CUSTOM": {}
diff --git a/task_dFC/run_scripts_slurm/invocation.json b/task_dFC/run_scripts_slurm/invocation.json
new file mode 100644
index 0000000..3cb59ac
--- /dev/null
+++ b/task_dFC/run_scripts_slurm/invocation.json
@@ -0,0 +1,24 @@
+{
+    "bids_dir": "[[NIPOPPY_DPATH_BIDS]]",
+    "output_dir": "[[NIPOPPY_DPATH_PIPELINE_OUTPUT]]",
+    "analysis_level": "participant",
+    "skip_bids_validation": true,
+    "participant_label": [
+        "[[NIPOPPY_PARTICIPANT_ID]]"
+    ],
+    "bids_database_dir": "[[NIPOPPY_DPATH_PIPELINE_BIDS_DB]]",
+    "nprocs": "16",
+    "omp_nthreads": "8",
+    "memory_gb": "32G",
+    "anat_only": false,
+    "output_spaces": [
+        "MNI152NLin2009cAsym:res-2"
+    ],
+    "regressors_all_comps": true,
+    "fs_license_file": "[[FREESURFER_LICENSE_FILE]]",
+    "fs_subjects_dir": "[[NIPOPPY_DPATH_DERIVATIVES]]/freesurfer/7.3.2/output/[[NIPOPPY_BIDS_SESSION_ID]]",
+    "verbose_count": "-v",
+    "work_dir": "/work",
+    "write_graph": true,
+    "notrack": true
+}

From bea48dbc641fdd1e003e4b5112d8436d4f89e559 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 28 Nov 2024 10:41:05 -0500
Subject: [PATCH 144/401] add GLM to report

---
 task_dFC/generate_report.py | 177 +++++++++++++++++++++++++++++++++++-
 1 file changed, 176 insertions(+), 1 deletion(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 301cb5e..38b5c82 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -6,7 +6,8 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
-from nilearn import image, plotting
+from nilearn import image, masking, plotting
+from nilearn.glm.first_level import FirstLevelModel
 from sklearn.cluster import KMeans
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
@@ -88,6 +89,33 @@ def load_task_data(roi_root, subj, task, run=None, session=None):
     return task_data
 
 
+def get_func_data(fmriprep_root, subj, task, bold_suffix, run=None, session=None):
+    if session is None:
+        ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/")
+    else:
+        ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/{session}/func/")
+
+    ALL_TASK_FILES = [
+        file_i
+        for file_i in ALL_TASK_FILES
+        if (bold_suffix in file_i) and (f"_{task}_" in file_i)
+    ]
+
+    if not len(ALL_TASK_FILES) >= 1:
+        return None
+
+    if run is None:
+        task_file = ALL_TASK_FILES[0]
+    else:
+        task_file = [file_i for file_i in ALL_TASK_FILES if f"_{run}_" in file_i][0]
+    if session is None:
+        func_file = f"{fmriprep_root}/{subj}/func/{task_file}"
+    else:
+        func_file = f"{fmriprep_root}/{subj}/{session}/func/{task_file}"
+
+    return func_file
+
+
 # def plot_anatomical(
 #     fmriprep_root,
 #     subj,
@@ -126,6 +154,115 @@ def load_task_data(roi_root, subj, task, run=None, session=None):
 #     display = plotting.plot_anat(mean_func_img, title="plot_func")
 
 
+def get_events_df(events, trial_type_label="trial_type", rest_labels=["rest", "Rest"]):
+    # find which column is the "onset" in the first row
+    onset_idx = np.where(events[0, :] == "onset")[0][0]
+    duration_idx = np.where(events[0, :] == "duration")[0][0]
+    if trial_type_label is not None:
+        trial_type_idx = np.where(events[0, :] == trial_type_label)[0][0]
+
+    # assign the time between active onsets to 'rest'
+    events_new = []
+    events_new.append(events[0, [onset_idx, duration_idx, trial_type_idx]])
+    prev_onset = 0.0
+    for i in range(1, events.shape[0]):
+
+        if events[i, trial_type_idx] in rest_labels:
+            continue
+
+        current_onset = float(events[i, onset_idx])
+        current_duration = float(events[i, duration_idx])
+        rest_duration = current_onset - prev_onset
+        if rest_duration > 0.0:
+            events_new.append([prev_onset, rest_duration, "rest"])
+        events_new.append([current_onset, current_duration, "active"])
+        prev_onset = current_onset + current_duration
+
+    events_new = np.array(events_new)
+
+    # convert to pandas dataframe
+    events_df = pd.DataFrame(
+        events_new[1:, :], columns=["onset", "duration", "trial_type"]
+    )
+
+    return events_df
+
+
+def plot_glm(
+    fmriprep_root,
+    roi_root,
+    subj,
+    task,
+    bold_suffix,
+    trial_type_label,
+    rest_labels,
+    output_root,
+    run=None,
+    session=None,
+):
+
+    func_file = get_func_data(
+        fmriprep_root=fmriprep_root,
+        subj=subj,
+        task=task,
+        bold_suffix=bold_suffix,
+        run=run,
+        session=session,
+    )
+    task_data = load_task_data(roi_root, subj, task, run, session)
+    TR_mri = task_data["TR_mri"]
+
+    events_df = get_events_df(
+        events=task_data["events"],
+        trial_type_label=trial_type_label,
+        rest_labels=rest_labels,
+    )
+
+    # Make an average
+    mean_img = image.mean_img(func_file)
+    mask = masking.compute_epi_mask(mean_img)
+
+    # Clean and smooth data
+    fmri_img = image.clean_img(func_file, standardize=False)
+    fmri_img = image.smooth_img(fmri_img, 5.0)
+
+    fmri_glm = FirstLevelModel(
+        t_r=TR_mri,
+        drift_model="cosine",
+        signal_scaling=False,
+        mask_img=mask,
+        minimize_memory=False,
+    )
+
+    fmri_glm = fmri_glm.fit(fmri_img, events_df)
+
+    z_map = fmri_glm.compute_contrast("active - rest")
+
+    plotting.plot_stat_map(z_map, bg_img=mean_img, threshold=3.1)
+
+    # save the figure
+    output_dir = f"{output_root}/subject_results/{subj}/GLM"
+    if session is not None:
+        output_dir = f"{output_dir}/{session}"
+    output_dir = f"{output_dir}/{task}"
+    if run is not None:
+        output_dir = f"{output_dir}/{run}"
+    output_dir = f"{output_dir}/"
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    plt.savefig(
+        f"{output_dir}/glm.{save_fig_format}",
+        dpi=fig_dpi,
+        bbox_inches=fig_bbox_inches,
+        pad_inches=fig_pad,
+        format=save_fig_format,
+    )
+
+    plt.close()
+
+
 def plot_roi_signals(
     roi_root,
     subj,
@@ -1086,6 +1223,19 @@ def create_html_report_subj_results(
 
                 img_height = 100
 
+                # display GLM
+                glm_img = f"{subj_dir}/GLM/{session_task_run_dir}/glm.png"
+                img = plt.imread(glm_img)
+                height, width, _ = img.shape
+                # change the width so that height equals img_height
+                width = int(width * img_height / height)
+                # replace the path to the image with a relative path
+                glm_img = glm_img.replace(subj_dir, ".")
+                file.write(
+                    f"<img src='{glm_img}' alt='GLM' width='{width}' height='{img_height}'>\n"
+                )
+                file.write("<br>\n")
+
                 # display ROI signals
                 ROI_signals_img = (
                     f"{subj_dir}/ROI_signals/{session_task_run_dir}/ROI_signals.png"
@@ -1459,6 +1609,15 @@ def create_html_report_group_results(
     else:
         main_root = dataset_info["main_root"]
 
+    if "{main_root}" in dataset_info["fmriprep_root"]:
+        fmriprep_root = dataset_info["fmriprep_root"].replace("{main_root}", main_root)
+    elif "{dataset}" in dataset_info["fmriprep_root"]:
+        fmriprep_root = dataset_info["fmriprep_root"].replace(
+            "{dataset}", dataset_info["dataset"]
+        )
+    else:
+        fmriprep_root = dataset_info["fmriprep_root"]
+
     if "{main_root}" in dataset_info["roi_root"]:
         roi_root = dataset_info["roi_root"].replace("{main_root}", main_root)
     else:
@@ -1507,6 +1666,22 @@ def create_html_report_group_results(
                     except Exception as e:
                         print(f"Error in plotting dFC matrices: {e}")
 
+                    try:
+                        plot_glm(
+                            fmriprep_root=fmriprep_root,
+                            roi_root=roi_root,
+                            subj=subj,
+                            task=task,
+                            bold_suffix=dataset_info["bold_suffix"],
+                            trial_type_label=dataset_info["trial_type_label"],
+                            rest_label=dataset_info["rest_labels"],
+                            output_root=reports_root,
+                            run=run,
+                            session=session,
+                        )
+                    except Exception as e:
+                        print(f"Error in plotting GLM: {e}")
+
                     try:
                         plot_roi_signals(
                             roi_root=roi_root,

From 0eb47f5004074bedc5f9e2c0144ae944ed5b2ece Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 28 Nov 2024 11:52:15 -0500
Subject: [PATCH 145/401] minor fix

---
 task_dFC/generate_report.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 38b5c82..509c8ba 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -1674,7 +1674,7 @@ def create_html_report_group_results(
                             task=task,
                             bold_suffix=dataset_info["bold_suffix"],
                             trial_type_label=dataset_info["trial_type_label"],
-                            rest_label=dataset_info["rest_labels"],
+                            rest_labels=dataset_info["rest_labels"],
                             output_root=reports_root,
                             run=run,
                             session=session,

From def6a51978d6c032c846d6aba362fdce3201a1bf Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 28 Nov 2024 21:59:09 -0500
Subject: [PATCH 146/401] minor fix

---
 task_dFC/generate_report.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 509c8ba..a040c21 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -163,12 +163,12 @@ def get_events_df(events, trial_type_label="trial_type", rest_labels=["rest", "R
 
     # assign the time between active onsets to 'rest'
     events_new = []
-    events_new.append(events[0, [onset_idx, duration_idx, trial_type_idx]])
     prev_onset = 0.0
     for i in range(1, events.shape[0]):
 
-        if events[i, trial_type_idx] in rest_labels:
-            continue
+        if trial_type_label is not None:
+            if events[i, trial_type_idx] in rest_labels:
+                continue
 
         current_onset = float(events[i, onset_idx])
         current_duration = float(events[i, duration_idx])
@@ -181,9 +181,7 @@ def get_events_df(events, trial_type_label="trial_type", rest_labels=["rest", "R
     events_new = np.array(events_new)
 
     # convert to pandas dataframe
-    events_df = pd.DataFrame(
-        events_new[1:, :], columns=["onset", "duration", "trial_type"]
-    )
+    events_df = pd.DataFrame(events_new, columns=["onset", "duration", "trial_type"])
 
     return events_df
 

From f9b98653c098256c6b0e9086a0e915dcf50cfa91 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 29 Nov 2024 10:51:12 -0500
Subject: [PATCH 147/401] change run scripts

---
 task_dFC/run_scripts_slurm/run_FCS.sh          | 1 -
 task_dFC/run_scripts_slurm/run_ML.sh           | 1 -
 task_dFC/run_scripts_slurm/run_dFC.sh          | 1 -
 task_dFC/run_scripts_slurm/run_fmriprep.sh     | 1 -
 task_dFC/run_scripts_slurm/run_nifti_to_roi.sh | 1 -
 task_dFC/run_scripts_slurm/run_report.sh       | 1 -
 6 files changed, 6 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh
index 7ef0058..9f65e88 100644
--- a/task_dFC/run_scripts_slurm/run_FCS.sh
+++ b/task_dFC/run_scripts_slurm/run_FCS.sh
@@ -3,7 +3,6 @@
 #SBATCH --job-name=fit_fcs_job   # Optional: Name of your job
 #SBATCH --output=logs/fcs_out.txt  # Standard output log
 #SBATCH --error=logs/fcs_err.txt   # Standard error log
-#SBATCH --account=rrg-jbpoline           # Account
 #SBATCH --time=96:00:00                # Walltime for each task (96 hours)
 #SBATCH --mem=64G                     # Memory request per node
 
diff --git a/task_dFC/run_scripts_slurm/run_ML.sh b/task_dFC/run_scripts_slurm/run_ML.sh
index da8c6cc..f90d030 100644
--- a/task_dFC/run_scripts_slurm/run_ML.sh
+++ b/task_dFC/run_scripts_slurm/run_ML.sh
@@ -3,7 +3,6 @@
 #SBATCH --job-name=ML_job   # Optional: Name of your job
 #SBATCH --output=logs/ML_out.txt  # Standard output log
 #SBATCH --error=logs/ML_err.txt   # Standard error log
-#SBATCH --account=rrg-jbpoline           # Account
 #SBATCH --time=72:00:00                # Walltime for each task (72 hours)
 #SBATCH --mem=70G                     # Memory request per node
 
diff --git a/task_dFC/run_scripts_slurm/run_dFC.sh b/task_dFC/run_scripts_slurm/run_dFC.sh
index e329fd0..9f55ace 100644
--- a/task_dFC/run_scripts_slurm/run_dFC.sh
+++ b/task_dFC/run_scripts_slurm/run_dFC.sh
@@ -3,7 +3,6 @@
 #SBATCH --job-name=assess_dfc_job   # Optional: Name of your job
 #SBATCH --output=logs/dfc_out.txt  # Standard output log
 #SBATCH --error=logs/dfc_err.txt   # Standard error log
-#SBATCH --account=rrg-jbpoline           # Account
 #SBATCH --time=24:00:00                # Walltime for each task (24 hours)
 #SBATCH --mem=32G                     # Memory request per node
 
diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index 0abfdbc..d58fa74 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -6,7 +6,6 @@
 #SBATCH --time=2-00:00:00                # Walltime (2 day)
 #SBATCH --mem-per-cpu=16G                # Memory (16 GB) per cpu
 #SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
-#SBATCH --account=rrg-jbpoline           # Account
 
 module load apptainer
 
diff --git a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
index 6d1d88d..8a4d1d9 100644
--- a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
+++ b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
@@ -3,7 +3,6 @@
 #SBATCH --job-name=extract_roi_job   # Optional: Name of your job
 #SBATCH --output=logs/roi_out.txt  # Standard output log
 #SBATCH --error=logs/roi_err.txt   # Standard error log
-#SBATCH --account=rrg-jbpoline           # Account
 #SBATCH --time=24:00:00                # Walltime for each task (24 hours)
 #SBATCH --mem=64G                     # Memory request per node
 
diff --git a/task_dFC/run_scripts_slurm/run_report.sh b/task_dFC/run_scripts_slurm/run_report.sh
index 57b6634..fa8032a 100644
--- a/task_dFC/run_scripts_slurm/run_report.sh
+++ b/task_dFC/run_scripts_slurm/run_report.sh
@@ -3,7 +3,6 @@
 #SBATCH --job-name=report_job   # Optional: Name of your job
 #SBATCH --output=logs/report_out.txt  # Standard output log
 #SBATCH --error=logs/report_err.txt   # Standard error log
-#SBATCH --account=rrg-jbpoline           # Account
 #SBATCH --time=24:00:00                # Walltime for each task (24 hours)
 #SBATCH --mem=16G                     # Memory request per node
 

From a5ec5f02be5b5a0bc049ea35bace8dfbd3ced897 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 29 Nov 2024 10:57:04 -0500
Subject: [PATCH 148/401] change run scripts

---
 task_dFC/run_scripts_slurm/run_report.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_report.sh b/task_dFC/run_scripts_slurm/run_report.sh
index fa8032a..10dba88 100644
--- a/task_dFC/run_scripts_slurm/run_report.sh
+++ b/task_dFC/run_scripts_slurm/run_report.sh
@@ -4,7 +4,7 @@
 #SBATCH --output=logs/report_out.txt  # Standard output log
 #SBATCH --error=logs/report_err.txt   # Standard error log
 #SBATCH --time=24:00:00                # Walltime for each task (24 hours)
-#SBATCH --mem=16G                     # Memory request per node
+#SBATCH --mem=32G                     # Memory request per node
 
 DATASET_INFO="./dataset_info.json"
 SUBJ_LIST="./subj_list.txt"

From 944206b4302c12b90f2fbe3212411ab8f8c40629 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 29 Nov 2024 15:15:54 -0500
Subject: [PATCH 149/401] change FCS est

---
 task_dFC/FCS_estimate.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index e54ef11..896c79b 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -75,18 +75,22 @@ def run_FCS_estimate(
 
     for MEASURE_id, measure in enumerate(MEASURES_lst):
 
-        print("MEASURE: " + measure.measure_name)
-        print("FCS estimation started...")
-
-        if measure.is_state_based:
-            measure.estimate_FCS(time_series=BOLD)
-
-        print("FCS estimation done.")
-
-        # Save
-        if not os.path.exists(f"{output_dir}"):
-            os.makedirs(f"{output_dir}")
-        np.save(f"{output_dir}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure)
+        try:
+            print("MEASURE: " + measure.measure_name)
+            print("FCS estimation started...")
+
+            if measure.is_state_based:
+                measure.estimate_FCS(time_series=BOLD)
+
+            print("FCS estimation done.")
+
+            # Save
+            if not os.path.exists(f"{output_dir}"):
+                os.makedirs(f"{output_dir}")
+            np.save(f"{output_dir}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure)
+        except Exception as e:
+            print(f"Error in MEASURE: {measure.measure_name}")
+            print(e)
 
     print(f"Measurement required {time.time() - tic:0.3f} seconds.")
     np.save(f"{output_dir}/multi-analysis_{file_suffix}.npy", MA)

From 8b35c45fcd18c8faa098ca48f376cfafe589be67 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 29 Nov 2024 15:16:35 -0500
Subject: [PATCH 150/401] minor

---
 task_dFC/run_scripts_slurm/run_report.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_report.sh b/task_dFC/run_scripts_slurm/run_report.sh
index 10dba88..f094835 100644
--- a/task_dFC/run_scripts_slurm/run_report.sh
+++ b/task_dFC/run_scripts_slurm/run_report.sh
@@ -4,7 +4,7 @@
 #SBATCH --output=logs/report_out.txt  # Standard output log
 #SBATCH --error=logs/report_err.txt   # Standard error log
 #SBATCH --time=24:00:00                # Walltime for each task (24 hours)
-#SBATCH --mem=32G                     # Memory request per node
+#SBATCH --mem=64G                     # Memory request per node
 
 DATASET_INFO="./dataset_info.json"
 SUBJ_LIST="./subj_list.txt"

From 1db7ce3fe75b22229fb062a9221de964e236aae9 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 3 Dec 2024 11:33:08 -0500
Subject: [PATCH 151/401] change run_FCS

---
 task_dFC/run_scripts_slurm/run_FCS.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh
index 9f65e88..3fc48ae 100644
--- a/task_dFC/run_scripts_slurm/run_FCS.sh
+++ b/task_dFC/run_scripts_slurm/run_FCS.sh
@@ -3,7 +3,7 @@
 #SBATCH --job-name=fit_fcs_job   # Optional: Name of your job
 #SBATCH --output=logs/fcs_out.txt  # Standard output log
 #SBATCH --error=logs/fcs_err.txt   # Standard error log
-#SBATCH --time=96:00:00                # Walltime for each task (96 hours)
+#SBATCH --time=4-00:00:00                # Walltime for each task (4 days)
 #SBATCH --mem=64G                     # Memory request per node
 
 DATASET_INFO="./dataset_info.json"

From 1be09cb1268689b9e7b2efec1116648f79a72dc7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 3 Dec 2024 11:37:15 -0500
Subject: [PATCH 152/401] change num_state to 12

---
 task_dFC/run_scripts_sge/methods_config.json   | 2 +-
 task_dFC/run_scripts_slurm/methods_config.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/run_scripts_sge/methods_config.json b/task_dFC/run_scripts_sge/methods_config.json
index d4013d4..0139d62 100644
--- a/task_dFC/run_scripts_sge/methods_config.json
+++ b/task_dFC/run_scripts_sge/methods_config.json
@@ -8,7 +8,7 @@
         "clstr_base_measure": "SlidingWindow",
         "hmm_iter": 20,
         "dhmm_obs_state_ratio": 0.666,
-        "n_states": 5,
+        "n_states": 12,
         "n_subj_clstrs": 10,
         "n_jobs": 2,
         "verbose": 0,
diff --git a/task_dFC/run_scripts_slurm/methods_config.json b/task_dFC/run_scripts_slurm/methods_config.json
index d4013d4..0139d62 100644
--- a/task_dFC/run_scripts_slurm/methods_config.json
+++ b/task_dFC/run_scripts_slurm/methods_config.json
@@ -8,7 +8,7 @@
         "clstr_base_measure": "SlidingWindow",
         "hmm_iter": 20,
         "dhmm_obs_state_ratio": 0.666,
-        "n_states": 5,
+        "n_states": 12,
         "n_subj_clstrs": 10,
         "n_jobs": 2,
         "verbose": 0,

From 7d4102570e4a90a828edd69941e7c7683a246efc Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 3 Dec 2024 12:53:29 -0500
Subject: [PATCH 153/401] change run_FCS

---
 task_dFC/run_scripts_slurm/run_FCS.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh
index 3fc48ae..11c5422 100644
--- a/task_dFC/run_scripts_slurm/run_FCS.sh
+++ b/task_dFC/run_scripts_slurm/run_FCS.sh
@@ -3,7 +3,7 @@
 #SBATCH --job-name=fit_fcs_job   # Optional: Name of your job
 #SBATCH --output=logs/fcs_out.txt  # Standard output log
 #SBATCH --error=logs/fcs_err.txt   # Standard error log
-#SBATCH --time=4-00:00:00                # Walltime for each task (4 days)
+#SBATCH --time=7-00:00:00                # Walltime for each task (7 days)
 #SBATCH --mem=64G                     # Memory request per node
 
 DATASET_INFO="./dataset_info.json"

From 55471c3d97a060ff6cab02b6b75631cf2133f38b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 3 Dec 2024 14:14:56 -0500
Subject: [PATCH 154/401] refactor dFC assess

---
 task_dFC/FCS_estimate.py   | 12 ++----------
 task_dFC/dFC_assessment.py |  8 ++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index 896c79b..81b4ce4 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -6,7 +6,7 @@
 
 import numpy as np
 
-from pydfc import MultiAnalysis, data_loader
+from pydfc import data_loader, multi_analysis_utils
 
 warnings.simplefilter("ignore")
 
@@ -21,7 +21,6 @@ def run_FCS_estimate(
     params_methods,
     MEASURES_name_lst,
     alter_hparams,
-    params_multi_analysis,
     task,
     roi_root,
     output_root,
@@ -60,11 +59,7 @@ def run_FCS_estimate(
     )
     ################################ Measures of dFC #################################
 
-    MA = MultiAnalysis(
-        analysis_name=f"task-based-dFC-{file_suffix}", **params_multi_analysis
-    )
-
-    MEASURES_lst = MA.measures_initializer(
+    MEASURES_lst = multi_analysis_utils.measures_initializer(
         MEASURES_name_lst, params_methods, alter_hparams
     )
 
@@ -93,7 +88,6 @@ def run_FCS_estimate(
             print(e)
 
     print(f"Measurement required {time.time() - tic:0.3f} seconds.")
-    np.save(f"{output_dir}/multi-analysis_{file_suffix}.npy", MA)
 
 
 ########################################################################################
@@ -173,7 +167,6 @@ def run_FCS_estimate(
     params_methods = methods_config["params_methods"]
     MEASURES_name_lst = methods_config["MEASURES_name_lst"]
     alter_hparams = methods_config["alter_hparams"]
-    params_multi_analysis = methods_config["params_multi_analysis"]
 
     for session in SESSIONS:
         for run in RUNS[task]:
@@ -181,7 +174,6 @@ def run_FCS_estimate(
                 params_methods=params_methods,
                 MEASURES_name_lst=MEASURES_name_lst,
                 alter_hparams=alter_hparams,
-                params_multi_analysis=params_multi_analysis,
                 task=task,
                 roi_root=roi_root,
                 output_root=fitted_measures_root,
diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py
index 06253ac..caeb0c9 100644
--- a/task_dFC/dFC_assessment.py
+++ b/task_dFC/dFC_assessment.py
@@ -149,17 +149,23 @@ def run_dFC_assess(
     parser = argparse.ArgumentParser(description=HELPTEXT)
 
     parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
+    parser.add_argument("--methods_config", type=str, help="methods config file")
     parser.add_argument("--participant_id", type=str, help="participant id")
 
     args = parser.parse_args()
 
     dataset_info_file = args.dataset_info
+    methods_config_file = args.methods_config
     participant_id = args.participant_id
 
     # Read dataset info
     with open(dataset_info_file, "r") as f:
         dataset_info = json.load(f)
 
+    # Read methods config
+    with open(methods_config_file, "r") as f:
+        methods_config = json.load(f)
+
     print(
         f"subject-level dFC assessment CODE started running ... for subject: {participant_id} ..."
     )
@@ -204,6 +210,8 @@ def run_dFC_assess(
     if RUNS is None:
         RUNS = {task: [None] for task in TASKS}
 
+    params_multi_analysis = methods_config["params_multi_analysis"]
+
     for session in SESSIONS:
         for task in TASKS:
             for run in RUNS[task]:

From 32ad496cade9e87fb5e0532d8eefc4dc0006cb09 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 3 Dec 2024 14:40:42 -0500
Subject: [PATCH 155/401] parallel change in dfc assess

---
 task_dFC/FCS_estimate.py                      | 32 +++++++++----------
 task_dFC/dFC_assessment.py                    | 18 ++++++-----
 task_dFC/run_scripts_sge/methods_config.json  |  2 +-
 .../run_scripts_slurm/methods_config.json     |  2 +-
 4 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index 81b4ce4..2478817 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -21,6 +21,7 @@ def run_FCS_estimate(
     params_methods,
     MEASURES_name_lst,
     alter_hparams,
+    params_multi_analysis,
     task,
     roi_root,
     output_root,
@@ -68,24 +69,19 @@ def run_FCS_estimate(
 
     ################################# estimate FCS #################################
 
-    for MEASURE_id, measure in enumerate(MEASURES_lst):
-
-        try:
-            print("MEASURE: " + measure.measure_name)
-            print("FCS estimation started...")
-
-            if measure.is_state_based:
-                measure.estimate_FCS(time_series=BOLD)
-
-            print("FCS estimation done.")
+    MEASURES_fit_lst = multi_analysis_utils.estimate_group_FCS(
+        time_series=BOLD,
+        MEASURES_lst=MEASURES_lst,
+        n_jobs=params_multi_analysis["n_jobs"],
+        verbose=params_multi_analysis["verbose"],
+        backend=params_multi_analysis["backend"],
+    )
 
-            # Save
-            if not os.path.exists(f"{output_dir}"):
-                os.makedirs(f"{output_dir}")
-            np.save(f"{output_dir}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure)
-        except Exception as e:
-            print(f"Error in MEASURE: {measure.measure_name}")
-            print(e)
+    # Save the fitted measures
+    for MEASURE_id, measure in enumerate(MEASURES_fit_lst):
+        if not os.path.exists(f"{output_dir}"):
+            os.makedirs(f"{output_dir}")
+        np.save(f"{output_dir}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure)
 
     print(f"Measurement required {time.time() - tic:0.3f} seconds.")
 
@@ -167,6 +163,7 @@ def run_FCS_estimate(
     params_methods = methods_config["params_methods"]
     MEASURES_name_lst = methods_config["MEASURES_name_lst"]
     alter_hparams = methods_config["alter_hparams"]
+    params_multi_analysis = methods_config["params_multi_analysis"]
 
     for session in SESSIONS:
         for run in RUNS[task]:
@@ -174,6 +171,7 @@ def run_FCS_estimate(
                 params_methods=params_methods,
                 MEASURES_name_lst=MEASURES_name_lst,
                 alter_hparams=alter_hparams,
+                params_multi_analysis=params_multi_analysis,
                 task=task,
                 roi_root=roi_root,
                 output_root=fitted_measures_root,
diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py
index caeb0c9..1ca06ad 100644
--- a/task_dFC/dFC_assessment.py
+++ b/task_dFC/dFC_assessment.py
@@ -6,7 +6,7 @@
 
 import numpy as np
 
-from pydfc import MultiAnalysis, data_loader
+from pydfc import data_loader, multi_analysis_utils
 
 warnings.simplefilter("ignore")
 
@@ -23,6 +23,7 @@ def run_dFC_assess(
     roi_root,
     fitted_measures_root,
     output_root,
+    params_multi_analysis,
     session=None,
     run=None,
 ):
@@ -89,11 +90,6 @@ def run_dFC_assess(
         return
     ################################# LOAD FIT MEASURES #################################
 
-    MA = np.load(
-        f"{fitted_measures_dir}/multi-analysis_{file_suffix}.npy",
-        allow_pickle="TRUE",
-    ).item()
-
     ALL_RECORDS = os.listdir(f"{fitted_measures_dir}/")
     ALL_RECORDS = [
         i for i in ALL_RECORDS if ("MEASURE" in i) and (f"_{file_suffix}_" in i)
@@ -103,7 +99,6 @@ def run_dFC_assess(
     for s in ALL_RECORDS:
         fit_measure = np.load(f"{fitted_measures_dir}/{s}", allow_pickle="TRUE").item()
         MEASURES_fit_lst.append(fit_measure)
-    MA.set_MEASURES_fit_lst(MEASURES_fit_lst)
     print("fitted MEASURES are loaded ...")
 
     ################################# LOAD DATA #################################
@@ -123,7 +118,13 @@ def run_dFC_assess(
     print("Measurement Started ...")
 
     print("dFC estimation started...")
-    dFC_dict = MA.subj_lvl_dFC_assess(time_series=BOLD)
+    dFC_dict = multi_analysis_utils.subj_lvl_dFC_assess(
+        time_series=BOLD,
+        MEASURES_fit_lst=MEASURES_fit_lst,
+        n_jobs=params_multi_analysis["n_jobs"],
+        verbose=params_multi_analysis["verbose"],
+        backend=params_multi_analysis["backend"],
+    )
     print("dFC estimation done.")
 
     print(f"Measurement required {time.time() - tic:0.3f} seconds.")
@@ -221,6 +222,7 @@ def run_dFC_assess(
                     roi_root=roi_root,
                     fitted_measures_root=fitted_measures_root,
                     output_root=output_root,
+                    params_multi_analysis=params_multi_analysis,
                     session=session,
                     run=run,
                 )
diff --git a/task_dFC/run_scripts_sge/methods_config.json b/task_dFC/run_scripts_sge/methods_config.json
index 0139d62..ee96381 100644
--- a/task_dFC/run_scripts_sge/methods_config.json
+++ b/task_dFC/run_scripts_sge/methods_config.json
@@ -28,7 +28,7 @@
     ],
     "alter_hparams" : [],
     "params_multi_analysis" : {
-        "n_jobs": null,
+        "n_jobs": 8,
         "verbose": 0,
         "backend": "loky"
     }
diff --git a/task_dFC/run_scripts_slurm/methods_config.json b/task_dFC/run_scripts_slurm/methods_config.json
index 0139d62..ee96381 100644
--- a/task_dFC/run_scripts_slurm/methods_config.json
+++ b/task_dFC/run_scripts_slurm/methods_config.json
@@ -28,7 +28,7 @@
     ],
     "alter_hparams" : [],
     "params_multi_analysis" : {
-        "n_jobs": null,
+        "n_jobs": 8,
         "verbose": 0,
         "backend": "loky"
     }

From 601623e845837163512245f23a0b40f59e44d290 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 3 Dec 2024 14:55:55 -0500
Subject: [PATCH 156/401] minor

---
 task_dFC/run_scripts_slurm/run_FCS.sh | 3 ++-
 task_dFC/run_scripts_slurm/run_dFC.sh | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh
index 11c5422..c0de268 100644
--- a/task_dFC/run_scripts_slurm/run_FCS.sh
+++ b/task_dFC/run_scripts_slurm/run_FCS.sh
@@ -4,7 +4,8 @@
 #SBATCH --output=logs/fcs_out.txt  # Standard output log
 #SBATCH --error=logs/fcs_err.txt   # Standard error log
 #SBATCH --time=7-00:00:00                # Walltime for each task (7 days)
-#SBATCH --mem=64G                     # Memory request per node
+#SBATCH --mem-per-cpu=64G                # Memory (64 GB) per cpu
+#SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
 
 DATASET_INFO="./dataset_info.json"
 METHODS_CONFIG="./methods_config.json"
diff --git a/task_dFC/run_scripts_slurm/run_dFC.sh b/task_dFC/run_scripts_slurm/run_dFC.sh
index 9f55ace..c785690 100644
--- a/task_dFC/run_scripts_slurm/run_dFC.sh
+++ b/task_dFC/run_scripts_slurm/run_dFC.sh
@@ -8,6 +8,7 @@
 
 SUBJECT_LIST="./subj_list.txt"
 DATASET_INFO="./dataset_info.json"
+METHODS_CONFIG="./methods_config.json"
 
 echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`"
 
@@ -19,6 +20,7 @@ source "/home/mt00/venvs/pydfc/bin/activate"
 
 python "/home/mt00/pydfc/dFC/task_dFC/dFC_assessment.py" \
 --dataset_info $DATASET_INFO \
+--methods_config $METHODS_CONFIG \
 --participant_id $SUBJECT_ID
 
 deactivate

From d43597a1b60e9c911bd00213b9ade724479c62ce Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 3 Dec 2024 16:39:44 -0500
Subject: [PATCH 157/401] add cluster_for_visual

---
 pydfc/ml_utils.py | 50 +++++++++++++++++++++++++++++++++
 task_dFC/ML.py    | 71 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index c53d8b9..ea789d0 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1426,3 +1426,53 @@ def task_paradigm_clustering(
         }
 
     return task_paradigm_clstr_RESULTS
+
+
+def cluster_for_visual(
+    task,
+    dFC_id,
+    roi_root,
+    dFC_root,
+    run=None,
+    session=None,
+    normalize_dFC=True,
+):
+    if run is None:
+        print(f"=============== {task} ===============")
+    else:
+        print(f"=============== {task} {run} ===============")
+
+    SUBJECTS = find_available_subjects(
+        dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id
+    )
+
+    print(f"Number of subjects: {len(SUBJECTS)}")
+
+    X, _, _, _, _, _, measure_name = dFC_feature_extraction(
+        task=task,
+        train_subjects=SUBJECTS,
+        test_subjects=[],
+        dFC_id=dFC_id,
+        roi_root=roi_root,
+        dFC_root=dFC_root,
+        run=run,
+        session=session,
+        dynamic_pred="no",
+        normalize_dFC=normalize_dFC,
+    )
+
+    # clustering
+    # apply kmeans clustering to dFC features
+    n_clusters = 12
+
+    kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
+    kmeans.fit(X)
+
+    # get centroids
+    centroids = kmeans.cluster_centers_
+    n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
+    centroids_mat = dFC_vec2mat(
+        centroids, n_regions
+    )  # shape: n_clusters x n_regions x n_regions
+
+    return centroids_mat, measure_name
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index c965319..1f1fdef 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -6,6 +6,7 @@
 import numpy as np
 
 from pydfc.ml_utils import (
+    cluster_for_visual,
     extract_task_features,
     task_paradigm_clustering,
     task_presence_classification,
@@ -224,6 +225,57 @@ def run_task_paradigm_clustering(
         )
 
 
+def run_clustering_for_visual(
+    dFC_id,
+    TASKS,
+    RUNS,
+    SESSIONS,
+    roi_root,
+    dFC_root,
+    output_root,
+    normalize_dFC=True,
+):
+    for session in SESSIONS:
+        if not session is None:
+            print(f"=================== {session} ===================")
+
+        for task_id, task in enumerate(TASKS):
+            for run in RUNS[task]:
+                try:
+                    centroids_mat, measure_name = cluster_for_visual(
+                        task=task,
+                        dFC_id=dFC_id,
+                        roi_root=roi_root,
+                        dFC_root=dFC_root,
+                        run=run,
+                        session=session,
+                        normalize_dFC=normalize_dFC,
+                    )
+
+                    # save the centroids
+                    suffix = "centroids"
+                    if session is not None:
+                        suffix = f"{suffix}_{session}"
+                    suffix = f"{suffix}_{task}"
+                    if run is not None:
+                        suffix = f"{suffix}_{run}"
+                    suffix = f"{suffix}_{measure_name}"
+
+                    if not os.path.exists(f"{output_root}/centroids"):
+                        os.makedirs(f"{output_root}/centroids")
+
+                    np.save(
+                        f"{output_root}/centroids/{suffix}.npy",
+                        centroids_mat,
+                    )
+
+                except Exception as e:
+                    print(
+                        f"Error in clustering for visualization for {session} {task} {run}: {e}"
+                    )
+                    traceback.print_exc()
+
+
 #######################################################################################
 
 if __name__ == "__main__":
@@ -356,6 +408,25 @@ def run_task_paradigm_clustering(
         traceback.print_exc()
 
     print(f"Task paradigm clustering finished for dFC ID {dFC_id}.")
+
+    print(f"Clustering for visualization started for dFC ID {dFC_id} ...")
+    try:
+        run_clustering_for_visual(
+            dFC_id=dFC_id,
+            TASKS=TASKS,
+            RUNS=RUNS,
+            SESSIONS=SESSIONS,
+            roi_root=roi_root,
+            dFC_root=dFC_root,
+            output_root=ML_root,
+            normalize_dFC=True,
+        )
+    except Exception as e:
+        print(f"Error in clustering for visualization for dFC ID {dFC_id}: {e}")
+        traceback.print_exc()
+
+    print(f"Clustering for visualization finished for dFC ID {dFC_id}.")
+
     print(f"Task presence prediction finished for dFC ID {dFC_id}.")
 
 #######################################################################################

From f9599337f5318e31329c220d3a549a40eaf02b03 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 3 Dec 2024 18:05:42 -0500
Subject: [PATCH 158/401] add centroids visual

---
 task_dFC/ML.py              |  10 ++--
 task_dFC/generate_report.py | 103 ++++++++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+), 3 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 1f1fdef..2cfed52 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -261,11 +261,15 @@ def run_clustering_for_visual(
                         suffix = f"{suffix}_{run}"
                     suffix = f"{suffix}_{measure_name}"
 
-                    if not os.path.exists(f"{output_root}/centroids"):
-                        os.makedirs(f"{output_root}/centroids")
+                    if session is None:
+                        folder = f"{output_root}/centroids"
+                    else:
+                        folder = f"{output_root}/{session}/centroids"
+                    if not os.path.exists(folder):
+                        os.makedirs(folder)
 
                     np.save(
-                        f"{output_root}/centroids/{suffix}.npy",
+                        f"{folder}/{suffix}.npy",
                         centroids_mat,
                     )
 
diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index a040c21..6f104e2 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -964,6 +964,54 @@ def plot_paradigm_clustering_score(
     plt.close()
 
 
+def plot_clstr_visual_centroids(
+    ML_root,
+    output_root,
+    session=None,
+):
+    """ """
+    # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy
+    # find all the paradigm_clustering_RESULTS files in the directory
+    if session is None:
+        input_dir = f"{ML_root}"
+    else:
+        input_dir = f"{ML_root}/{session}"
+
+    output_dir = f"{output_root}/group_results/visual_clustering_centroids"
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    ALL_CENTROID_RESULTS = os.listdir(input_dir)
+    ALL_CENTROID_RESULTS = [
+        result_file for result_file in ALL_CENTROID_RESULTS if "centroids_" in result_file
+    ]
+    ALL_CENTROID_RESULTS.sort()
+
+    for result_file in ALL_CENTROID_RESULTS:
+        centroids_mats = np.load(f"{input_dir}/{result_file}", allow_pickle="TRUE")
+
+        # result_file is centroids_{session}_{task}_{run}_{measure_name}.npy
+        # suffix is whatever comes after the centroids and before .npy
+        suffix = result_file.split("centroids_")[1].split(".npy")[0]
+
+        centroids_dict = {}
+        for i, centroid_mat in enumerate(centroids_mats):
+            centroids_dict[f"Cluster {i + 1}"] = centroid_mat
+
+        visualize_conn_mat_dict(
+            data=centroids_dict,
+            title=f"visual-centroids_{suffix}",
+            cmap="seismic",
+            normalize=True,
+            disp_diag=False,
+            save_image=True,
+            output_root=f"{output_dir}/",
+            center_0=True,
+            # node_networks=None,
+        )
+
+
 # def plot_paradigm_clstr_centroids(
 #     ML_root,
 #     output_root,
@@ -1525,6 +1573,61 @@ def create_html_report_group_results(
 
             file.write("<br>\n")
 
+        # display visual clustering centroids
+        img_height = 300
+        file.write("<h2>Visual Clustering Centroids</h2>\n")
+        # find all png files in the directory
+        visual_clustering_centroids_dir = f"{group_dir}/visual_clustering_centroids"
+        for session in SESSIONS:
+            if session is not None:
+                file.write(f"<h3> {session} </h3>\n")
+            for task in TASKS:
+                file.write(f"<h3> {task} </h3>\n")
+                for run in RUNS[task]:
+                    if run is not None:
+                        file.write(f"<h3> {run} </h3>\n")
+
+                    # visual-centroids_{session}_{task}_{run}_{measure_name}.png
+                    all_centroids_img_files = os.listdir(visual_clustering_centroids_dir)
+                    all_centroids_img_files = [
+                        centroids_img_file
+                        for centroids_img_file in all_centroids_img_files
+                        if "visual-centroids" in centroids_img_file
+                        and f"_{task}" in centroids_img_file
+                    ]
+                    if session is not None:
+                        all_centroids_img_files = [
+                            centroids_img_file
+                            for centroids_img_file in all_centroids_img_files
+                            if f"_{session}" in centroids_img_file
+                        ]
+                    if run is not None:
+                        all_centroids_img_files = [
+                            centroids_img_file
+                            for centroids_img_file in all_centroids_img_files
+                            if f"_{run}" in centroids_img_file
+                        ]
+                    all_centroids_img_files.sort()
+
+                    for centroids_img_file in all_centroids_img_files:
+                        # iterate over centroids images of different measures
+                        centroid_img = (
+                            f"{visual_clustering_centroids_dir}/{centroids_img_file}"
+                        )
+                        measure_name = centroids_img_file.split("_")[-1].split(".")[0]
+                        file.write(f"<h3>{measure_name}</h3>\n")
+                        # get the original size of the image
+                        img = plt.imread(centroid_img)
+                        height, width, _ = img.shape
+                        # change the width so that height equals img_height
+                        width = int(width * img_height / height)
+                        # replace the path to the image with a relative path
+                        centroid_img = centroid_img.replace(group_dir, ".")
+                        file.write(
+                            f"<img src='{centroid_img}' alt='Visual clustering centroids' width='{width}' height='{img_height}'>\n"
+                        )
+                        file.write("<br>\n")
+
         # # display paradigm clustering centroids
         # img_height = 300
         # file.write("<h2>Paradigm Clustering Centroids</h2>\n")

From a3da6dfa932209e4f75dd986fdf5b60023fe309e Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 3 Dec 2024 18:53:52 -0500
Subject: [PATCH 159/401] add task features without hrf

---
 pydfc/ml_utils.py           |  10 ++-
 pydfc/task_utils.py         |  19 +++--
 task_dFC/ML.py              |  33 ++++++---
 task_dFC/generate_report.py | 135 +++++++++++++++++++++++++-----------
 4 files changed, 141 insertions(+), 56 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index ea789d0..0905e65 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -128,9 +128,14 @@ def load_task_data(roi_root, subj, task, run=None, session=None):
 ################################# Feature Extraction Functions ####################################
 
 
-def extract_task_features(TASKS, RUNS, session, roi_root, dFC_root):
+def extract_task_features(TASKS, RUNS, session, roi_root, dFC_root, no_hrf=False):
+    """
+    Extract task features from the event data.
+
+    if no_hrf is True, the task presence will be binarized without convolving with HRF.
+    Therefore the task features will be extracted based on the event labels and
+    without the effect of HRF.
     """
-    Extract task features from the event data."""
     task_features = {
         "task": list(),
         "run": list(),
@@ -167,6 +172,7 @@ def extract_task_features(TASKS, RUNS, session, roi_root, dFC_root):
                     TR_mri=task_data["TR_mri"],
                     binary=True,
                     binarizing_method="mean",
+                    no_hrf=no_hrf,
                 )
 
                 relative_task_on = calc_relative_task_on(task_presence)
diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 4fa8f0d..1264d82 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -260,7 +260,13 @@ def downsample_events_hrf(events_hrf, TR_mri, TR_task, method="uniform"):
 
 
 def extract_task_presence(
-    event_labels, TR_task, TR_mri, TR_array=None, binary=True, binarizing_method="median"
+    event_labels,
+    TR_task,
+    TR_mri,
+    TR_array=None,
+    binary=True,
+    binarizing_method="median",
+    no_hrf=False,
 ):
     """
     event_labels: event labels including 0 and event ids at the time each event happens
@@ -272,14 +278,19 @@ def extract_task_presence(
     It also downsamples the task presence to the time points of the dFC data
     if binary is True, the task presence is binarized using the mean of the task presence
     binarizing_method: 'median' or 'mean'
+
+    if no_hrf is True, the task presence is not convolved with HRF
     """
 
     # event_labels_all_task is all conditions together, rest vs. task times
     event_labels_all_task = np.multiply(event_labels != 0, 1)
 
-    event_labels_all_task_hrf = event_labels_conv_hrf(
-        event_labels=event_labels_all_task, TR_mri=TR_mri, TR_task=TR_task
-    )
+    if no_hrf:
+        event_labels_all_task_hrf = event_labels_all_task
+    else:
+        event_labels_all_task_hrf = event_labels_conv_hrf(
+            event_labels=event_labels_all_task, TR_mri=TR_mri, TR_task=TR_task
+        )
 
     # keep the task signal of events_hrf_0_1_ds
     if event_labels_all_task_hrf.shape[1] == 1:
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 2cfed52..b5f587f 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -25,18 +25,31 @@ def run_task_features_extraction(
     output_root,
 ):
     for session in SESSIONS:
+
+        # Extract task features without HRF effect
         task_features = extract_task_features(
             TASKS=TASKS,
             RUNS=RUNS,
             session=session,
             roi_root=roi_root,
             dFC_root=dFC_root,
+            no_hrf=True,
+        )
+
+        # Extract task features with HRF effect
+        task_features_hrf = extract_task_features(
+            TASKS=TASKS,
+            RUNS=RUNS,
+            session=session,
+            roi_root=roi_root,
+            dFC_root=dFC_root,
+            no_hrf=False,
         )
 
         if session is None:
-            folder = f"{output_root}"
+            folder = f"{output_root}/task_features"
         else:
-            folder = f"{output_root}/{session}"
+            folder = f"{output_root}/task_features/{session}"
         try:
             if not os.path.exists(folder):
                 os.makedirs(folder)
@@ -45,6 +58,8 @@ def run_task_features_extraction(
         try:
             if not os.path.exists(f"{folder}/task_features.npy"):
                 np.save(f"{folder}/task_features.npy", task_features)
+            if not os.path.exists(f"{folder}/task_features_hrf.npy"):
+                np.save(f"{folder}/task_features_hrf.npy", task_features_hrf)
         except OSError as err:
             print(err)
 
@@ -104,9 +119,9 @@ def run_classification(
                     traceback.print_exc()
 
         if session is None:
-            folder = f"{output_root}"
+            folder = f"{output_root}/classification"
         else:
-            folder = f"{output_root}/{session}"
+            folder = f"{output_root}/classification/{session}"
         try:
             if not os.path.exists(folder):
                 os.makedirs(folder)
@@ -169,9 +184,9 @@ def run_clustering(
                     traceback.print_exc()
 
         if session is None:
-            folder = f"{output_root}"
+            folder = f"{output_root}/clustering"
         else:
-            folder = f"{output_root}/{session}"
+            folder = f"{output_root}/clustering/{session}"
         try:
             if not os.path.exists(folder):
                 os.makedirs(folder)
@@ -210,9 +225,9 @@ def run_task_paradigm_clustering(
             continue
 
         if session is None:
-            folder = f"{output_root}"
+            folder = f"{output_root}/task_paradigm_clstr"
         else:
-            folder = f"{output_root}/{session}"
+            folder = f"{output_root}/task_paradigm_clstr/{session}"
         try:
             if not os.path.exists(folder):
                 os.makedirs(folder)
@@ -264,7 +279,7 @@ def run_clustering_for_visual(
                     if session is None:
                         folder = f"{output_root}/centroids"
                     else:
-                        folder = f"{output_root}/{session}/centroids"
+                        folder = f"{output_root}/centroids/{session}"
                     if not os.path.exists(folder):
                         os.makedirs(folder)
 
diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 6f104e2..e7d7b81 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -610,9 +610,9 @@ def plot_ML_results(
     # the ML_scores files are saved as ML_scores_classify_{dFC_id}.npy
     # find all the ML_scores files in the directory
     if session is None:
-        input_dir = f"{ML_root}"
+        input_dir = f"{ML_root}/classification"
     else:
-        input_dir = f"{ML_root}/{session}"
+        input_dir = f"{ML_root}/classification/{session}"
     ALL_ML_SCORES = os.listdir(input_dir)
     ALL_ML_SCORES = [
         score_file for score_file in ALL_ML_SCORES if "ML_scores_classify" in score_file
@@ -712,9 +712,9 @@ def plot_clustering_results(
     # the clustering_scores files are saved as clustering_scores_{dFC_id}.npy
     # find all the clustering_scores files in the directory
     if session is None:
-        input_dir = f"{ML_root}"
+        input_dir = f"{ML_root}/clustering"
     else:
-        input_dir = f"{ML_root}/{session}"
+        input_dir = f"{ML_root}/clustering/{session}"
     ALL_CLUSTERING_SCORES = os.listdir(input_dir)
     ALL_CLUSTERING_SCORES = [
         score_file
@@ -853,9 +853,9 @@ def plot_paradigm_clustering_score(
     # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy
     # find all the paradigm_clustering_RESULTS files in the directory
     if session is None:
-        input_dir = f"{ML_root}"
+        input_dir = f"{ML_root}/task_paradigm_clstr"
     else:
-        input_dir = f"{ML_root}/{session}"
+        input_dir = f"{ML_root}/task_paradigm_clstr/{session}"
     ALL_PARADIGM_CLUSTERING_RESULTS = os.listdir(input_dir)
     ALL_PARADIGM_CLUSTERING_RESULTS = [
         result_file
@@ -964,18 +964,18 @@ def plot_paradigm_clustering_score(
     plt.close()
 
 
-def plot_clstr_visual_centroids(
+def plot_visual_clstr_centroids(
     ML_root,
     output_root,
     session=None,
 ):
     """ """
-    # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy
-    # find all the paradigm_clustering_RESULTS files in the directory
+    # the centroids files are saved as centroids_{session}_{task}_{run}_{measure_name}.npy
+    # find all the centroids files in the directory
     if session is None:
-        input_dir = f"{ML_root}"
+        input_dir = f"{ML_root}/centroids"
     else:
-        input_dir = f"{ML_root}/{session}"
+        input_dir = f"{ML_root}/centroids/{session}"
 
     output_dir = f"{output_root}/group_results/visual_clustering_centroids"
 
@@ -1170,6 +1170,7 @@ def plot_task_presence_features(
 ):
     """
     Plot the task presence features for a given session and run.
+    Features for both with and without HRF are plotted.
     for comparability of tasks, pass the same run number for all tasks
     parameters:
     ----------
@@ -1180,30 +1181,47 @@ def plot_task_presence_features(
     """
     if session is None:
         task_features = np.load(
-            f"{ML_root}/task_features.npy", allow_pickle="TRUE"
+            f"{ML_root}/task_features/task_features.npy", allow_pickle="TRUE"
+        ).item()
+        task_features_hrf = np.load(
+            f"{ML_root}/task_features/task_features_hrf.npy", allow_pickle="TRUE"
         ).item()
     else:
         task_features = np.load(
-            f"{ML_root}/{session}/task_features.npy", allow_pickle="TRUE"
+            f"{ML_root}/task_features/{session}/task_features.npy", allow_pickle="TRUE"
+        ).item()
+        task_features_hrf = np.load(
+            f"{ML_root}/task_features/{session}/task_features_hrf.npy",
+            allow_pickle="TRUE",
         ).item()
 
     sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0})
 
     sns.set_style("darkgrid")
 
-    dataframe = pd.DataFrame(task_features)
+    task_features_df = pd.DataFrame(task_features)
+    task_features_hrf_df = pd.DataFrame(task_features_hrf)
     if run is not None:
-        dataframe = dataframe[dataframe["run"] == run]
+        task_features_df = task_features_df[task_features_df["run"] == run]
+        task_features_hrf_df = task_features_hrf_df[task_features_hrf_df["run"] == run]
 
     # FEATURES are columns in the dataframe except for 'task' and 'run'
-    FEATURES = list(dataframe.columns)
+    FEATURES = list(task_features_df.columns)
     FEATURES.remove("task")
     FEATURES.remove("run")
 
+    if session is None:
+        output_dir = f"{output_root}/group_results/task_presence_features"
+    else:
+        output_dir = f"{output_root}/group_results/task_presence_features/{session}"
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
     for i, feature in enumerate(FEATURES):
         plt.figure(figsize=(10, 5))
         sns.pointplot(
-            data=dataframe,
+            data=task_features_df,
             x="task",
             y=feature,
             errorbar="sd",
@@ -1211,15 +1229,8 @@ def plot_task_presence_features(
             dodge=True,
             capsize=0.1,
         )
-        # save the figure
-        if session is None:
-            output_dir = f"{output_root}/group_results/task_presence_features"
-        else:
-            output_dir = f"{output_root}/group_results/task_presence_features/{session}"
-
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
 
+        # save the figure
         plt.savefig(
             f"{output_dir}/task_presence_features_{feature}.{save_fig_format}",
             dpi=fig_dpi,
@@ -1227,7 +1238,27 @@ def plot_task_presence_features(
             pad_inches=fig_pad,
             format=save_fig_format,
         )
+        plt.close()
 
+        plt.figure(figsize=(10, 5))
+        sns.pointplot(
+            data=task_features_hrf_df,
+            x="task",
+            y=feature,
+            errorbar="sd",
+            linestyle="none",
+            dodge=True,
+            capsize=0.1,
+        )
+
+        # save the figure
+        plt.savefig(
+            f"{output_dir}/task_presence_features_hrf_{feature}.{save_fig_format}",
+            dpi=fig_dpi,
+            bbox_inches=fig_bbox_inches,
+            pad_inches=fig_pad,
+            format=save_fig_format,
+        )
         plt.close()
 
 
@@ -1406,22 +1437,35 @@ def create_html_report_group_results(
             task_presence_features_dir = f"{group_dir}/task_presence_features/{session}"
         else:
             task_presence_features_dir = f"{group_dir}/task_presence_features"
-        # find all png files in the directory
-        for file_name in os.listdir(task_presence_features_dir):
-            if file_name.endswith(".png"):
-                task_presence_features_img = f"{task_presence_features_dir}/{file_name}"
-                # get the original size of the image
-                img = plt.imread(task_presence_features_img)
-                height, width, _ = img.shape
-                # change the width so that height equals img_height
-                width = int(width * img_height / height)
-                # replace the path to the image with a relative path
-                task_presence_features_img = task_presence_features_img.replace(
-                    group_dir, "."
-                )
-                file.write(
-                    f"<img src='{task_presence_features_img}' alt='Task presence features' width='{width}' height='{img_height}'>\n"
-                )
+
+        for condition in ["with_HRF", "without_HRF"]:
+            file.write(f"<h2>{condition}</h2>\n")
+            # find all png files in the directory
+            for file_name in os.listdir(task_presence_features_dir):
+                if file_name.endswith(".png"):
+                    if (condition == "with_HRF" and "hrf" not in file_name) or (
+                        condition == "without_HRF" and "hrf" in file_name
+                    ):
+                        continue
+                    task_presence_features_img = (
+                        f"{task_presence_features_dir}/{file_name}"
+                    )
+                    # get the original size of the image
+                    img = plt.imread(task_presence_features_img)
+                    height, width, _ = img.shape
+                    # change the width so that height equals img_height
+                    width = int(width * img_height / height)
+                    # replace the path to the image with a relative path
+                    task_presence_features_img = task_presence_features_img.replace(
+                        group_dir, "."
+                    )
+                    file.write(
+                        f"<img src='{task_presence_features_img}' alt='Task presence features' width='{width}' height='{img_height}'>\n"
+                    )
+
+            file.write("<br>\n")
+
+    file.write("<br>\n")
 
     # classification results
     img_height = 300
@@ -1889,6 +1933,15 @@ def create_html_report_group_results(
             except Exception as e:
                 print(f"Error in plotting paradigm clustering scores: {e}")
 
+        try:
+            plot_visual_clstr_centroids(
+                ML_root=ML_root,
+                output_root=reports_root,
+                session=session,
+            )
+        except Exception as e:
+            print(f"Error in plotting visual clustering centroids: {e}")
+
         # try:
         #     plot_paradigm_clstr_centroids(
         #         ML_root=ML_root,

From bf3b9d434aff03a69a69fcc956012bea5bbd27e2 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 4 Dec 2024 12:52:43 -0500
Subject: [PATCH 160/401] fix bug

---
 pydfc/task_utils.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 1264d82..a750494 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -192,7 +192,7 @@ def event_conv_hrf(event_signal, TR_mri, TR_task):
     return events_hrf
 
 
-def event_labels_conv_hrf(event_labels, TR_mri, TR_task):
+def event_labels_conv_hrf(event_labels, TR_mri, TR_task, no_hrf=False):
     """
     event_labels: event labels including 0 and event ids at the time each event happens
     TR_mri: TR of MRI
@@ -202,6 +202,10 @@ def event_labels_conv_hrf(event_labels, TR_mri, TR_task):
     return: event labels convolved with HRF for each event type
     the convolved event labels have the same length as the event_labels
     event type i convolved with HRF is in events_hrf[:, i-1]
+
+    events_hrf[:, 0] is the resting state
+
+    if no_hrf is True, the event labels are not convolved with HRF
     """
 
     event_labels = np.array(event_labels)
@@ -216,7 +220,10 @@ def event_labels_conv_hrf(event_labels, TR_mri, TR_task):
         event_signal = np.zeros(L)
         event_signal[event_labels[:, 0] == event_id] = 1.0
 
-        events_hrf[:, i] = event_conv_hrf(event_signal, TR_mri, TR_task)
+        if no_hrf:
+            events_hrf[:, i] = event_signal
+        else:
+            events_hrf[:, i] = event_conv_hrf(event_signal, TR_mri, TR_task)
 
     # the time points that are not in any event are considered as resting state
     events_hrf[np.sum(events_hrf[:, 1:], axis=1) == 0.0, 0] = 1.0
@@ -285,12 +292,9 @@ def extract_task_presence(
     # event_labels_all_task is all conditions together, rest vs. task times
     event_labels_all_task = np.multiply(event_labels != 0, 1)
 
-    if no_hrf:
-        event_labels_all_task_hrf = event_labels_all_task
-    else:
-        event_labels_all_task_hrf = event_labels_conv_hrf(
-            event_labels=event_labels_all_task, TR_mri=TR_mri, TR_task=TR_task
-        )
+    event_labels_all_task_hrf = event_labels_conv_hrf(
+        event_labels=event_labels_all_task, TR_mri=TR_mri, TR_task=TR_task, no_hrf=no_hrf
+    )
 
     # keep the task signal of events_hrf_0_1_ds
     if event_labels_all_task_hrf.shape[1] == 1:

From a833032ef807719d4124ee3fba7c8c23b2546290 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 4 Dec 2024 13:17:38 -0500
Subject: [PATCH 161/401] minor bug

---
 task_dFC/FCS_estimate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index 2478817..c4c0eec 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -60,7 +60,7 @@ def run_FCS_estimate(
     )
     ################################ Measures of dFC #################################
 
-    MEASURES_lst = multi_analysis_utils.measures_initializer(
+    MEASURES_lst, hyper_param_info = multi_analysis_utils.measures_initializer(
         MEASURES_name_lst, params_methods, alter_hparams
     )
 

From 5f26bc647b44682017707024ba68bcd449e058a3 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 5 Dec 2024 12:23:04 -0500
Subject: [PATCH 162/401] change FCS est to loop over method

---
 task_dFC/FCS_estimate.py | 67 ++++++++++++++++++++++++----------------
 1 file changed, 40 insertions(+), 27 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index c4c0eec..3a4299a 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -64,6 +64,9 @@ def run_FCS_estimate(
         MEASURES_name_lst, params_methods, alter_hparams
     )
 
+    # in this script we process only one measure
+    assert len(MEASURES_lst) == 1, "Only one measure should be processed in this script"
+
     tic = time.time()
     print("Measurement Started ...")
 
@@ -77,6 +80,10 @@ def run_FCS_estimate(
         backend=params_multi_analysis["backend"],
     )
 
+    assert (
+        len(MEASURES_fit_lst) == 1
+    ), "Only one measure should be processed in this script"
+
     # Save the fitted measures
     for MEASURE_id, measure in enumerate(MEASURES_fit_lst):
         if not os.path.exists(f"{output_dir}"):
@@ -114,18 +121,6 @@ def run_FCS_estimate(
 
     TASKS = dataset_info["TASKS"]
 
-    job_id = os.getenv("SGE_TASK_ID")  # for SGE
-    if job_id is None:
-        job_id = os.getenv("SLURM_ARRAY_TASK_ID")  # for SLURM
-    job_id = int(job_id)
-    TASK_id = job_id - 1  # TASK_ID starts from 1 not 0
-    if TASK_id >= len(TASKS):
-        print("TASK_id out of TASKS")
-        exit()
-    task = TASKS[TASK_id]
-
-    print(f"FCS estimation CODE started running ... for task: {task} ...")
-
     if "SESSIONS" in dataset_info:
         SESSIONS = dataset_info["SESSIONS"]
     else:
@@ -138,7 +133,7 @@ def run_FCS_estimate(
     else:
         RUNS = None
     if RUNS is None:
-        RUNS = {task: [None]}
+        RUNS = {task: [None] for task in TASKS}
 
     if "{dataset}" in dataset_info["main_root"]:
         main_root = dataset_info["main_root"].replace(
@@ -165,19 +160,37 @@ def run_FCS_estimate(
     alter_hparams = methods_config["alter_hparams"]
     params_multi_analysis = methods_config["params_multi_analysis"]
 
+    # pick one method
+    job_id = os.getenv("SGE_TASK_ID")  # for SGE
+    if job_id is None:
+        job_id = os.getenv("SLURM_ARRAY_TASK_ID")  # for SLURM
+    job_id = int(job_id)
+    MEASURE_id = job_id - 1  # job_id starts from 1 not 0
+    if MEASURE_id >= len(MEASURES_name_lst):
+        print("MEASURE_id out of MEASURES_name_lst range")
+        exit()
+    picked_measure_list = [MEASURES_name_lst[MEASURE_id]]  # pick one method but as a list
+
+    print(
+        f"FCS estimation CODE started running ... for measure: {picked_measure_list[0]} ..."
+    )
+
     for session in SESSIONS:
-        for run in RUNS[task]:
-            run_FCS_estimate(
-                params_methods=params_methods,
-                MEASURES_name_lst=MEASURES_name_lst,
-                alter_hparams=alter_hparams,
-                params_multi_analysis=params_multi_analysis,
-                task=task,
-                roi_root=roi_root,
-                output_root=fitted_measures_root,
-                session=session,
-                run=run,
-            )
-
-    print(f"FCS estimation CODE finished running ... for task: {task} ...")
+        for task in TASKS:
+            for run in RUNS[task]:
+                run_FCS_estimate(
+                    params_methods=params_methods,
+                    MEASURES_name_lst=picked_measure_list,
+                    alter_hparams=alter_hparams,
+                    params_multi_analysis=params_multi_analysis,
+                    task=task,
+                    roi_root=roi_root,
+                    output_root=fitted_measures_root,
+                    session=session,
+                    run=run,
+                )
+
+    print(
+        f"FCS estimation CODE finished running ... for measure: {picked_measure_list[0]} ..."
+    )
 #################################################################################

From 72c564b901f5f40d15db42a41669abf4d4655dea Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 5 Dec 2024 12:26:43 -0500
Subject: [PATCH 163/401] minor

---
 task_dFC/run_scripts_slurm/run_FCS.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh
index c0de268..11c5422 100644
--- a/task_dFC/run_scripts_slurm/run_FCS.sh
+++ b/task_dFC/run_scripts_slurm/run_FCS.sh
@@ -4,8 +4,7 @@
 #SBATCH --output=logs/fcs_out.txt  # Standard output log
 #SBATCH --error=logs/fcs_err.txt   # Standard error log
 #SBATCH --time=7-00:00:00                # Walltime for each task (7 days)
-#SBATCH --mem-per-cpu=64G                # Memory (64 GB) per cpu
-#SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
+#SBATCH --mem=64G                     # Memory request per node
 
 DATASET_INFO="./dataset_info.json"
 METHODS_CONFIG="./methods_config.json"

From df1b6661a1a41e69abc2437a268af55a10b85a78 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 5 Dec 2024 13:05:53 -0500
Subject: [PATCH 164/401] minor

---
 task_dFC/FCS_estimate.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index 3a4299a..aafbc67 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -86,8 +86,11 @@ def run_FCS_estimate(
 
     # Save the fitted measures
     for MEASURE_id, measure in enumerate(MEASURES_fit_lst):
-        if not os.path.exists(f"{output_dir}"):
-            os.makedirs(f"{output_dir}")
+        try:
+            if not os.path.exists(f"{output_dir}"):
+                os.makedirs(f"{output_dir}")
+        except OSError as err:
+            print(err)
         np.save(f"{output_dir}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure)
 
     print(f"Measurement required {time.time() - tic:0.3f} seconds.")

From eb8c5c4d71d0911de83ec6bab55e8e9552399930 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 5 Dec 2024 13:28:44 -0500
Subject: [PATCH 165/401] fix bug

---
 task_dFC/FCS_estimate.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index aafbc67..d4ee3ff 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -66,6 +66,11 @@ def run_FCS_estimate(
 
     # in this script we process only one measure
     assert len(MEASURES_lst) == 1, "Only one measure should be processed in this script"
+    # and we assume alter_hparams is empty
+    # if not, we need to change the naming of the output files
+    assert (
+        len(hyper_param_info) == 0
+    ), "alter_hparams is assumed to be empty in this script"
 
     tic = time.time()
     print("Measurement Started ...")
@@ -85,13 +90,14 @@ def run_FCS_estimate(
     ), "Only one measure should be processed in this script"
 
     # Save the fitted measures
-    for MEASURE_id, measure in enumerate(MEASURES_fit_lst):
+    for measure in MEASURES_fit_lst:
         try:
             if not os.path.exists(f"{output_dir}"):
                 os.makedirs(f"{output_dir}")
         except OSError as err:
             print(err)
-        np.save(f"{output_dir}/MEASURE_{file_suffix}_{MEASURE_id}.npy", measure)
+        measure_name = MEASURES_name_lst[0]
+        np.save(f"{output_dir}/MEASURE_{file_suffix}_{measure_name}.npy", measure)
 
     print(f"Measurement required {time.time() - tic:0.3f} seconds.")
 

From 2f8d1f3c3afa48553eab0c02ebaf26b038e0feaa Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 5 Dec 2024 13:46:17 -0500
Subject: [PATCH 166/401] minor

---
 task_dFC/FCS_estimate.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index d4ee3ff..bf04818 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -68,9 +68,7 @@ def run_FCS_estimate(
     assert len(MEASURES_lst) == 1, "Only one measure should be processed in this script"
     # and we assume alter_hparams is empty
     # if not, we need to change the naming of the output files
-    assert (
-        len(hyper_param_info) == 0
-    ), "alter_hparams is assumed to be empty in this script"
+    assert len(alter_hparams) == 0, "alter_hparams is assumed to be empty in this script"
 
     tic = time.time()
     print("Measurement Started ...")

From 3ca71dbbea5a1cb24f97386e92f023d1faacaa5f Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 6 Dec 2024 11:18:46 -0500
Subject: [PATCH 167/401] minor

---
 task_dFC/run_scripts_slurm/run_ML.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/run_ML.sh b/task_dFC/run_scripts_slurm/run_ML.sh
index f90d030..c0363c9 100644
--- a/task_dFC/run_scripts_slurm/run_ML.sh
+++ b/task_dFC/run_scripts_slurm/run_ML.sh
@@ -3,8 +3,8 @@
 #SBATCH --job-name=ML_job   # Optional: Name of your job
 #SBATCH --output=logs/ML_out.txt  # Standard output log
 #SBATCH --error=logs/ML_err.txt   # Standard error log
-#SBATCH --time=72:00:00                # Walltime for each task (72 hours)
-#SBATCH --mem=70G                     # Memory request per node
+#SBATCH --time=4-00:00:00                # Walltime for each task (4 days)
+#SBATCH --mem=128G                     # Memory request per node
 
 DATASET_INFO="./dataset_info.json"
 

From 19b6f6cf3fb891d0074e4bd076cd452f0998f2e1 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 9 Dec 2024 16:02:11 -0500
Subject: [PATCH 168/401] change in dataset-info

---
 task_dFC/nifti_to_roi_signal.py              | 4 ++--
 task_dFC/run_scripts_slurm/dataset_info.json | 8 ++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index e4216a4..37cfd2a 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -276,8 +276,8 @@ def run_roi_signal_extraction(
                 output_root=output_root,
                 session=session,
                 RUNS=RUNS[task],
-                trial_type_label=trial_type_label,
-                rest_labels=rest_labels,
+                trial_type_label=trial_type_label[task],
+                rest_labels=rest_labels[task],
             )
 
     print(
diff --git a/task_dFC/run_scripts_slurm/dataset_info.json b/task_dFC/run_scripts_slurm/dataset_info.json
index 74f4ddf..c975277 100644
--- a/task_dFC/run_scripts_slurm/dataset_info.json
+++ b/task_dFC/run_scripts_slurm/dataset_info.json
@@ -8,8 +8,6 @@
 	"dFC_root" : "{main_root}/derivatives/dFC_assessed",
 	"ML_root" : "{main_root}/derivatives/ML",
 	"reports_root" : "{main_root}/derivatives/reports",
-	"trial_type_label" : "trial_type",
-	"rest_labels" : ["rest", "Rest"],
 	"bold_suffix" : "_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz",
 	"SESSIONS" : [
 		"ses-1"
@@ -19,5 +17,11 @@
 	],
 	"RUNS" : {
     		"task-A": ["run-01", "run-02", "run-03", "run-04", "run-05", "run-06"]
+	},
+	"trial_type_label" : {
+			"task-A": "trial_type"
+	},
+	"rest_labels" : {
+			"task-A": ["rest", "Rest"]
 	}
 }

From 1e536f1c7ef6f39e7680bde5a001abacf569eed0 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 9 Dec 2024 16:27:15 -0500
Subject: [PATCH 169/401] change states to 5

---
 pydfc/ml_utils.py                              | 2 +-
 task_dFC/run_scripts_slurm/methods_config.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 0905e65..9f68065 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1469,7 +1469,7 @@ def cluster_for_visual(
 
     # clustering
     # apply kmeans clustering to dFC features
-    n_clusters = 12
+    n_clusters = 5
 
     kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
     kmeans.fit(X)
diff --git a/task_dFC/run_scripts_slurm/methods_config.json b/task_dFC/run_scripts_slurm/methods_config.json
index ee96381..0fc455f 100644
--- a/task_dFC/run_scripts_slurm/methods_config.json
+++ b/task_dFC/run_scripts_slurm/methods_config.json
@@ -8,7 +8,7 @@
         "clstr_base_measure": "SlidingWindow",
         "hmm_iter": 20,
         "dhmm_obs_state_ratio": 0.666,
-        "n_states": 12,
+        "n_states": 5,
         "n_subj_clstrs": 10,
         "n_jobs": 2,
         "verbose": 0,

From 9452fb40efb08419240fcb2ba5acaa5773e07080 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 9 Dec 2024 20:11:15 -0500
Subject: [PATCH 170/401] fix acq data file

---
 task_dFC/nifti_to_roi_signal.py | 57 +++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 17 deletions(-)

diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 37cfd2a..52dc1c8 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -65,18 +65,30 @@ def run_roi_signal_extraction(
             task_events_root = f"{bids_root}/{subj}/{session}/func"
         info_file = f"{task_events_root}/{task_file.replace(bold_suffix, '_bold.json')}"
 
+        if os.path.exists(info_file):
+            f = open(info_file)
+            acquisition_data = json.load(f)
+            f.close()
+        else:
+            acquisition_data = None
+
         # in some cases the info file is common for all subjects and can be found in f"{bids_root}"
-        if not os.path.exists(info_file):
-            ALL_COMMON_FILES = os.listdir(f"{bids_root}/")
-            ALL_COMMON_FILES = [
-                file_i
-                for file_i in ALL_COMMON_FILES
-                if (f"{task}_" in file_i) and ("_bold.json" in file_i)
-            ]
-            if len(ALL_COMMON_FILES) == 1:
-                info_file = f"{bids_root}/{ALL_COMMON_FILES[0]}"
-        if not os.path.exists(info_file):
-            # if the info file is not found, exclude the subject
+        ALL_COMMON_FILES = os.listdir(f"{bids_root}/")
+        ALL_COMMON_FILES = [
+            file_i
+            for file_i in ALL_COMMON_FILES
+            if (f"{task}_" in file_i) and ("_bold.json" in file_i)
+        ]
+        if len(ALL_COMMON_FILES) == 1:
+            global_info_file = f"{bids_root}/{ALL_COMMON_FILES[0]}"
+            f = open(global_info_file)
+            global_acquisition_data = json.load(f)
+            f.close()
+        else:
+            global_acquisition_data = None
+
+        if global_acquisition_data is None and acquisition_data is None:
+            # if the acquisition_data is not found, exclude the subject
             if run is None:
                 print(f"bold.json info file not found for {subj} {session_str} {task}")
             else:
@@ -84,12 +96,23 @@ def run_roi_signal_extraction(
                     f"bold.json info file not found for {subj} {session_str} {task} {run}"
                 )
             return
-        ################################# LOAD JSON INFO #########################
-        # Opening JSON file as a dictionary
-        f = open(info_file)
-        acquisition_data = json.load(f)
-        f.close()
-        TR_mri = acquisition_data["RepetitionTime"]
+        ################################# GET REPETITION TIME #########################
+        TR_mri = None
+        # first check the acquisition_data
+        if acquisition_data is not None:
+            if "RepetitionTime" in acquisition_data:
+                TR_mri = acquisition_data["RepetitionTime"]
+        # if not found, check the global_acquisition_data
+        if TR_mri is None and global_acquisition_data is not None:
+            if "RepetitionTime" in global_acquisition_data:
+                TR_mri = global_acquisition_data["RepetitionTime"]
+        # if not found, print a warning and skip the subject
+        if TR_mri is None:
+            if run is None:
+                print(f"Repetition time not found for {subj} {session_str} {task}")
+            else:
+                print(f"Repetition time not found for {subj} {session_str} {task} {run}")
+            return
         ################################# EXTRACT TIME SERIES #########################
         # extract ROI signals and convert to TIME_SERIES object
         time_series = data_loader.nifti2timeseries(

From 7cb62060ece98cd2d31188b9232e19d42579f4fd Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 6 Jan 2025 18:12:29 -0500
Subject: [PATCH 171/401] add co-occurrence matrix to ML

---
 pydfc/ml_utils.py           |  51 ++++++++++++++-
 task_dFC/ML.py              |  17 ++++-
 task_dFC/generate_report.py | 125 +++++++++++++++++++++++++++++++++++-
 3 files changed, 186 insertions(+), 7 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 9f68065..4333fbb 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1434,6 +1434,40 @@ def task_paradigm_clustering(
     return task_paradigm_clstr_RESULTS
 
 
+def co_occurrence(task_labels, clstr_labels):
+    """
+    Calculate the co-occurrence between task labels and clustering labels.
+    """
+    co_occurrence_matrix = np.zeros(
+        (len(np.unique(task_labels)), len(np.unique(clstr_labels)))
+    )
+    for i, task_label in enumerate(np.unique(task_labels)):
+        for j, clstr_label in enumerate(np.unique(clstr_labels)):
+            co_occurrence_matrix[i, j] = np.sum(
+                (task_labels == task_label) & (clstr_labels == clstr_label)
+            )
+
+    # now find the percentage of time each cluster label was present in each task label
+    cluster_label_percentage = (
+        co_occurrence_matrix / np.sum(co_occurrence_matrix, axis=1)[:, None]
+    )
+    # make sure that the sum of each row is 1
+    assert np.allclose(
+        np.sum(cluster_label_percentage, axis=1), 1
+    ), "Sum of each row is not 1."
+
+    # now find the percentage of time each task label occupied each cluster label
+    task_label_percentage = (
+        co_occurrence_matrix / np.sum(co_occurrence_matrix, axis=0)[None, :]
+    )
+    # make sure that the sum of each column is 1
+    assert np.allclose(
+        np.sum(task_label_percentage, axis=0), 1
+    ), "Sum of each column is not 1."
+
+    return co_occurrence_matrix, cluster_label_percentage, task_label_percentage
+
+
 def cluster_for_visual(
     task,
     dFC_id,
@@ -1454,7 +1488,7 @@ def cluster_for_visual(
 
     print(f"Number of subjects: {len(SUBJECTS)}")
 
-    X, _, _, _, _, _, measure_name = dFC_feature_extraction(
+    X, _, y, _, _, _, measure_name = dFC_feature_extraction(
         task=task,
         train_subjects=SUBJECTS,
         test_subjects=[],
@@ -1472,7 +1506,12 @@ def cluster_for_visual(
     n_clusters = 5
 
     kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
-    kmeans.fit(X)
+    clstr_labels = kmeans.fit_predict(X)  # clstr_labels = (n_samples,)
+
+    # calculate the co-occurrence matrix
+    co_occurrence_matrix, cluster_label_percentage, task_label_percentage = co_occurrence(
+        y, clstr_labels
+    )
 
     # get centroids
     centroids = kmeans.cluster_centers_
@@ -1481,4 +1520,10 @@ def cluster_for_visual(
         centroids, n_regions
     )  # shape: n_clusters x n_regions x n_regions
 
-    return centroids_mat, measure_name
+    return (
+        centroids_mat,
+        measure_name,
+        co_occurrence_matrix,
+        cluster_label_percentage,
+        task_label_percentage,
+    )
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index b5f587f..a45f3ea 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -257,7 +257,13 @@ def run_clustering_for_visual(
         for task_id, task in enumerate(TASKS):
             for run in RUNS[task]:
                 try:
-                    centroids_mat, measure_name = cluster_for_visual(
+                    (
+                        centroids_mat,
+                        measure_name,
+                        co_occurrence_matrix,
+                        cluster_label_percentage,
+                        task_label_percentage,
+                    ) = cluster_for_visual(
                         task=task,
                         dFC_id=dFC_id,
                         roi_root=roi_root,
@@ -267,6 +273,13 @@ def run_clustering_for_visual(
                         normalize_dFC=normalize_dFC,
                     )
 
+                    centroids = {
+                        "centroids_mat": centroids_mat,
+                        "co_occurrence_matrix": co_occurrence_matrix,
+                        "cluster_label_percentage": cluster_label_percentage,
+                        "task_label_percentage": task_label_percentage,
+                    }
+
                     # save the centroids
                     suffix = "centroids"
                     if session is not None:
@@ -285,7 +298,7 @@ def run_clustering_for_visual(
 
                     np.save(
                         f"{folder}/{suffix}.npy",
-                        centroids_mat,
+                        centroids,
                     )
 
                 except Exception as e:
diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index e7d7b81..84fdde6 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -989,14 +989,20 @@ def plot_visual_clstr_centroids(
     ALL_CENTROID_RESULTS.sort()
 
     for result_file in ALL_CENTROID_RESULTS:
-        centroids_mats = np.load(f"{input_dir}/{result_file}", allow_pickle="TRUE")
+        centroids_results = np.load(
+            f"{input_dir}/{result_file}", allow_pickle="TRUE"
+        ).item()
+        centroids_mat = centroids_results["centroids_mat"]
+        co_occurrence_matrix = centroids_results["co_occurrence_matrix"]
+        cluster_label_percentage = centroids_results["cluster_label_percentage"]
+        task_label_percentage = centroids_results["task_label_percentage"]
 
         # result_file is centroids_{session}_{task}_{run}_{measure_name}.npy
         # suffix is whatever comes after the centroids and before .npy
         suffix = result_file.split("centroids_")[1].split(".npy")[0]
 
         centroids_dict = {}
-        for i, centroid_mat in enumerate(centroids_mats):
+        for i, centroid_mat in enumerate(centroids_mat):
             centroids_dict[f"Cluster {i + 1}"] = centroid_mat
 
         visualize_conn_mat_dict(
@@ -1011,6 +1017,73 @@ def plot_visual_clstr_centroids(
             # node_networks=None,
         )
 
+        # plot co-occurrence matrix and cluster label percentage and task label percentage
+        # as a seaborn heatmap with numbers in the cells
+        # as separate figures
+
+        # plot co-occurrence matrix
+        plt.figure(figsize=(10, 5))
+        sns.heatmap(
+            co_occurrence_matrix,
+            annot=True,
+            fmt=".2f",
+            cmap="coolwarm",
+            cbar_kws={"label": "Co-occurrence"},
+        )
+        plt.title("Co-occurrence matrix")
+        plt.xlabel("Cluster")
+        plt.ylabel("Task")
+        plt.savefig(
+            f"{output_dir}/co-occurrence-matrix_{suffix}.{save_fig_format}",
+            dpi=fig_dpi,
+            bbox_inches=fig_bbox_inches,
+            pad_inches=fig_pad,
+            format=save_fig_format,
+        )
+        plt.close()
+
+        # plot cluster label percentage
+        plt.figure(figsize=(10, 5))
+        sns.heatmap(
+            cluster_label_percentage,
+            annot=True,
+            fmt=".2f",
+            cmap="coolwarm",
+            cbar_kws={"label": "Percentage"},
+        )
+        plt.title("Cluster label percentage")
+        plt.xlabel("Cluster")
+        plt.ylabel("Task")
+        plt.savefig(
+            f"{output_dir}/cluster-label-percentage_{suffix}.{save_fig_format}",
+            dpi=fig_dpi,
+            bbox_inches=fig_bbox_inches,
+            pad_inches=fig_pad,
+            format=save_fig_format,
+        )
+        plt.close()
+
+        # plot task label percentage
+        plt.figure(figsize=(10, 5))
+        sns.heatmap(
+            task_label_percentage,
+            annot=True,
+            fmt=".2f",
+            cmap="coolwarm",
+            cbar_kws={"label": "Percentage"},
+        )
+        plt.title("Task label percentage")
+        plt.xlabel("Cluster")
+        plt.ylabel("Task")
+        plt.savefig(
+            f"{output_dir}/task-label-percentage_{suffix}.{save_fig_format}",
+            dpi=fig_dpi,
+            bbox_inches=fig_bbox_inches,
+            pad_inches=fig_pad,
+            format=save_fig_format,
+        )
+        plt.close()
+
 
 # def plot_paradigm_clstr_centroids(
 #     ML_root,
@@ -1670,6 +1743,54 @@ def create_html_report_group_results(
                         file.write(
                             f"<img src='{centroid_img}' alt='Visual clustering centroids' width='{width}' height='{img_height}'>\n"
                         )
+
+                        # visual-centroids_{suffix}.png
+                        suffix = centroids_img_file[
+                            centroids_img_file.find("visual-centroids_") + 17 : -4
+                        ]
+
+                        # display co-occurrence matrix
+                        co_occurrence_matrix_img = f"{visual_clustering_centroids_dir}/co-occurrence-matrix_{suffix}.png"
+                        img = plt.imread(co_occurrence_matrix_img)
+                        height, width, _ = img.shape
+                        # change the width so that height equals img_height
+                        width = int(width * img_height / height)
+                        # replace the path to the image with a relative path
+                        co_occurrence_matrix_img = co_occurrence_matrix_img.replace(
+                            group_dir, "."
+                        )
+                        file.write(
+                            f"<img src='{co_occurrence_matrix_img}' alt='Co-occurrence matrix' width='{width}' height='{img_height}'>\n"
+                        )
+
+                        # display cluster label percentage
+                        cluster_label_percentage_img = f"{visual_clustering_centroids_dir}/cluster-label-percentage_{suffix}.png"
+                        img = plt.imread(cluster_label_percentage_img)
+                        height, width, _ = img.shape
+                        # change the width so that height equals img_height
+                        width = int(width * img_height / height)
+                        # replace the path to the image with a relative path
+                        cluster_label_percentage_img = (
+                            cluster_label_percentage_img.replace(group_dir, ".")
+                        )
+                        file.write(
+                            f"<img src='{cluster_label_percentage_img}' alt='Cluster label percentage' width='{width}' height='{img_height}'>\n"
+                        )
+
+                        # display task label percentage
+                        task_label_percentage_img = f"{visual_clustering_centroids_dir}/task-label-percentage_{suffix}.png"
+                        img = plt.imread(task_label_percentage_img)
+                        height, width, _ = img.shape
+                        # change the width so that height equals img_height
+                        width = int(width * img_height / height)
+                        # replace the path to the image with a relative path
+                        task_label_percentage_img = task_label_percentage_img.replace(
+                            group_dir, "."
+                        )
+                        file.write(
+                            f"<img src='{task_label_percentage_img}' alt='Task label percentage' width='{width}' height='{img_height}'>\n"
+                        )
+
                         file.write("<br>\n")
 
         # # display paradigm clustering centroids

From 14181d4b1243a4b03fa79667c9549b90253098a0 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 9 Jan 2025 12:52:31 -0500
Subject: [PATCH 172/401] improve report

---
 task_dFC/generate_report.py | 244 ++++++++++++++++++------------------
 1 file changed, 125 insertions(+), 119 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 84fdde6..e177391 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -1022,12 +1022,12 @@ def plot_visual_clstr_centroids(
         # as separate figures
 
         # plot co-occurrence matrix
-        plt.figure(figsize=(10, 5))
+        plt.figure(figsize=(20, 10))
         sns.heatmap(
             co_occurrence_matrix,
             annot=True,
-            fmt=".2f",
-            cmap="coolwarm",
+            fmt=".0f",
+            cmap="Reds",
             cbar_kws={"label": "Co-occurrence"},
         )
         plt.title("Co-occurrence matrix")
@@ -1043,12 +1043,12 @@ def plot_visual_clstr_centroids(
         plt.close()
 
         # plot cluster label percentage
-        plt.figure(figsize=(10, 5))
+        plt.figure(figsize=(20, 10))
         sns.heatmap(
             cluster_label_percentage,
             annot=True,
             fmt=".2f",
-            cmap="coolwarm",
+            cmap="Reds",
             cbar_kws={"label": "Percentage"},
         )
         plt.title("Cluster label percentage")
@@ -1064,12 +1064,12 @@ def plot_visual_clstr_centroids(
         plt.close()
 
         # plot task label percentage
-        plt.figure(figsize=(10, 5))
+        plt.figure(figsize=(20, 10))
         sns.heatmap(
             task_label_percentage,
             annot=True,
             fmt=".2f",
-            cmap="coolwarm",
+            cmap="Reds",
             cbar_kws={"label": "Percentage"},
         )
         plt.title("Task label percentage")
@@ -1660,15 +1660,18 @@ def create_html_report_group_results(
         for embedding in ["PCA", "LE"]:
             file.write(f"<h3>{embedding}</h3>\n")
             paradigm_clustering_img = f"{paradigm_clustering_dir}/paradigm_clustering_results_ARI_{embedding}.png"
-            img = plt.imread(paradigm_clustering_img)
-            height, width, _ = img.shape
-            # change the width so that height equals img_height
-            width = int(width * img_height / height)
-            # replace the path to the image with a relative path
-            paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".")
-            file.write(
-                f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
-            )
+            try:
+                img = plt.imread(paradigm_clustering_img)
+                height, width, _ = img.shape
+                # change the width so that height equals img_height
+                width = int(width * img_height / height)
+                # replace the path to the image with a relative path
+                paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".")
+                file.write(
+                    f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
+                )
+            except e:
+                print(f"Error: {e}")
 
             file.write("<br>\n")
 
@@ -1678,120 +1681,123 @@ def create_html_report_group_results(
         for embedding in ["PCA", "LE"]:
             file.write(f"<h3>{embedding}</h3>\n")
             paradigm_clustering_img = f"{paradigm_clustering_dir}/paradigm_clustering_results_SI_{embedding}.png"
-            img = plt.imread(paradigm_clustering_img)
-            height, width, _ = img.shape
-            # change the width so that height equals img_height
-            width = int(width * img_height / height)
-            # replace the path to the image with a relative path
-            paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".")
-            file.write(
-                f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
-            )
+            try:
+                img = plt.imread(paradigm_clustering_img)
+                height, width, _ = img.shape
+                # change the width so that height equals img_height
+                width = int(width * img_height / height)
+                # replace the path to the image with a relative path
+                paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".")
+                file.write(
+                    f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
+                )
+            except e:
+                print(f"Error: {e}")
 
             file.write("<br>\n")
 
-        # display visual clustering centroids
-        img_height = 300
-        file.write("<h2>Visual Clustering Centroids</h2>\n")
-        # find all png files in the directory
-        visual_clustering_centroids_dir = f"{group_dir}/visual_clustering_centroids"
-        for session in SESSIONS:
-            if session is not None:
-                file.write(f"<h3> {session} </h3>\n")
-            for task in TASKS:
-                file.write(f"<h3> {task} </h3>\n")
-                for run in RUNS[task]:
-                    if run is not None:
-                        file.write(f"<h3> {run} </h3>\n")
-
-                    # visual-centroids_{session}_{task}_{run}_{measure_name}.png
-                    all_centroids_img_files = os.listdir(visual_clustering_centroids_dir)
+    # display visual clustering centroids
+    img_height = 300
+    file.write("<h1>Visual Clustering Centroids</h2>\n")
+    # find all png files in the directory
+    visual_clustering_centroids_dir = f"{group_dir}/visual_clustering_centroids"
+    for session in SESSIONS:
+        if session is not None:
+            file.write(f"<h3> {session} </h3>\n")
+        for task in TASKS:
+            file.write(f"<h3> {task} </h3>\n")
+            for run in RUNS[task]:
+                if run is not None:
+                    file.write(f"<h3> {run} </h3>\n")
+
+                # visual-centroids_{session}_{task}_{run}_{measure_name}.png
+                all_centroids_img_files = os.listdir(visual_clustering_centroids_dir)
+                all_centroids_img_files = [
+                    centroids_img_file
+                    for centroids_img_file in all_centroids_img_files
+                    if "visual-centroids" in centroids_img_file
+                    and f"_{task}" in centroids_img_file
+                ]
+                if session is not None:
                     all_centroids_img_files = [
                         centroids_img_file
                         for centroids_img_file in all_centroids_img_files
-                        if "visual-centroids" in centroids_img_file
-                        and f"_{task}" in centroids_img_file
+                        if f"_{session}" in centroids_img_file
                     ]
-                    if session is not None:
-                        all_centroids_img_files = [
-                            centroids_img_file
-                            for centroids_img_file in all_centroids_img_files
-                            if f"_{session}" in centroids_img_file
-                        ]
-                    if run is not None:
-                        all_centroids_img_files = [
-                            centroids_img_file
-                            for centroids_img_file in all_centroids_img_files
-                            if f"_{run}" in centroids_img_file
-                        ]
-                    all_centroids_img_files.sort()
-
-                    for centroids_img_file in all_centroids_img_files:
-                        # iterate over centroids images of different measures
-                        centroid_img = (
-                            f"{visual_clustering_centroids_dir}/{centroids_img_file}"
-                        )
-                        measure_name = centroids_img_file.split("_")[-1].split(".")[0]
-                        file.write(f"<h3>{measure_name}</h3>\n")
-                        # get the original size of the image
-                        img = plt.imread(centroid_img)
-                        height, width, _ = img.shape
-                        # change the width so that height equals img_height
-                        width = int(width * img_height / height)
-                        # replace the path to the image with a relative path
-                        centroid_img = centroid_img.replace(group_dir, ".")
-                        file.write(
-                            f"<img src='{centroid_img}' alt='Visual clustering centroids' width='{width}' height='{img_height}'>\n"
-                        )
+                if run is not None:
+                    all_centroids_img_files = [
+                        centroids_img_file
+                        for centroids_img_file in all_centroids_img_files
+                        if f"_{run}" in centroids_img_file
+                    ]
+                all_centroids_img_files.sort()
 
-                        # visual-centroids_{suffix}.png
-                        suffix = centroids_img_file[
-                            centroids_img_file.find("visual-centroids_") + 17 : -4
-                        ]
-
-                        # display co-occurrence matrix
-                        co_occurrence_matrix_img = f"{visual_clustering_centroids_dir}/co-occurrence-matrix_{suffix}.png"
-                        img = plt.imread(co_occurrence_matrix_img)
-                        height, width, _ = img.shape
-                        # change the width so that height equals img_height
-                        width = int(width * img_height / height)
-                        # replace the path to the image with a relative path
-                        co_occurrence_matrix_img = co_occurrence_matrix_img.replace(
-                            group_dir, "."
-                        )
-                        file.write(
-                            f"<img src='{co_occurrence_matrix_img}' alt='Co-occurrence matrix' width='{width}' height='{img_height}'>\n"
-                        )
+                for centroids_img_file in all_centroids_img_files:
+                    # iterate over centroids images of different measures
+                    centroid_img = (
+                        f"{visual_clustering_centroids_dir}/{centroids_img_file}"
+                    )
+                    measure_name = centroids_img_file.split("_")[-1].split(".")[0]
+                    file.write(f"<h3>{measure_name}</h3>\n")
+                    # get the original size of the image
+                    img = plt.imread(centroid_img)
+                    height, width, _ = img.shape
+                    # change the width so that height equals img_height
+                    width = int(width * img_height / height)
+                    # replace the path to the image with a relative path
+                    centroid_img = centroid_img.replace(group_dir, ".")
+                    file.write(
+                        f"<img src='{centroid_img}' alt='Visual clustering centroids' width='{width}' height='{img_height}'>\n"
+                    )
 
-                        # display cluster label percentage
-                        cluster_label_percentage_img = f"{visual_clustering_centroids_dir}/cluster-label-percentage_{suffix}.png"
-                        img = plt.imread(cluster_label_percentage_img)
-                        height, width, _ = img.shape
-                        # change the width so that height equals img_height
-                        width = int(width * img_height / height)
-                        # replace the path to the image with a relative path
-                        cluster_label_percentage_img = (
-                            cluster_label_percentage_img.replace(group_dir, ".")
-                        )
-                        file.write(
-                            f"<img src='{cluster_label_percentage_img}' alt='Cluster label percentage' width='{width}' height='{img_height}'>\n"
-                        )
+                    # visual-centroids_{suffix}.png
+                    suffix = centroids_img_file[
+                        centroids_img_file.find("visual-centroids_") + 17 : -4
+                    ]
 
-                        # display task label percentage
-                        task_label_percentage_img = f"{visual_clustering_centroids_dir}/task-label-percentage_{suffix}.png"
-                        img = plt.imread(task_label_percentage_img)
-                        height, width, _ = img.shape
-                        # change the width so that height equals img_height
-                        width = int(width * img_height / height)
-                        # replace the path to the image with a relative path
-                        task_label_percentage_img = task_label_percentage_img.replace(
-                            group_dir, "."
-                        )
-                        file.write(
-                            f"<img src='{task_label_percentage_img}' alt='Task label percentage' width='{width}' height='{img_height}'>\n"
-                        )
+                    # display co-occurrence matrix
+                    co_occurrence_matrix_img = f"{visual_clustering_centroids_dir}/co-occurrence-matrix_{suffix}.png"
+                    img = plt.imread(co_occurrence_matrix_img)
+                    height, width, _ = img.shape
+                    # change the width so that height equals img_height
+                    width = int(width * img_height / height)
+                    # replace the path to the image with a relative path
+                    co_occurrence_matrix_img = co_occurrence_matrix_img.replace(
+                        group_dir, "."
+                    )
+                    file.write(
+                        f"<img src='{co_occurrence_matrix_img}' alt='Co-occurrence matrix' width='{width}' height='{img_height}'>\n"
+                    )
+
+                    # display cluster label percentage
+                    cluster_label_percentage_img = f"{visual_clustering_centroids_dir}/cluster-label-percentage_{suffix}.png"
+                    img = plt.imread(cluster_label_percentage_img)
+                    height, width, _ = img.shape
+                    # change the width so that height equals img_height
+                    width = int(width * img_height / height)
+                    # replace the path to the image with a relative path
+                    cluster_label_percentage_img = cluster_label_percentage_img.replace(
+                        group_dir, "."
+                    )
+                    file.write(
+                        f"<img src='{cluster_label_percentage_img}' alt='Cluster label percentage' width='{width}' height='{img_height}'>\n"
+                    )
 
-                        file.write("<br>\n")
+                    # display task label percentage
+                    task_label_percentage_img = f"{visual_clustering_centroids_dir}/task-label-percentage_{suffix}.png"
+                    img = plt.imread(task_label_percentage_img)
+                    height, width, _ = img.shape
+                    # change the width so that height equals img_height
+                    width = int(width * img_height / height)
+                    # replace the path to the image with a relative path
+                    task_label_percentage_img = task_label_percentage_img.replace(
+                        group_dir, "."
+                    )
+                    file.write(
+                        f"<img src='{task_label_percentage_img}' alt='Task label percentage' width='{width}' height='{img_height}'>\n"
+                    )
+
+                    file.write("<br>\n")
 
         # # display paradigm clustering centroids
         # img_height = 300

From c58f310ff537a2a0e3cd238d73181b21a1d7fd33 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 9 Jan 2025 16:02:24 -0500
Subject: [PATCH 173/401] minor bug

---
 task_dFC/generate_report.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index e177391..f575589 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -1670,7 +1670,7 @@ def create_html_report_group_results(
                 file.write(
                     f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
                 )
-            except e:
+            except Exception as e:
                 print(f"Error: {e}")
 
             file.write("<br>\n")
@@ -1691,7 +1691,7 @@ def create_html_report_group_results(
                 file.write(
                     f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
                 )
-            except e:
+            except Exception as e:
                 print(f"Error: {e}")
 
             file.write("<br>\n")

From 28069e0a08ffecbc492e9741ffd91598e13c9d3d Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 9 Jan 2025 16:10:11 -0500
Subject: [PATCH 174/401] minor change

---
 task_dFC/generate_report.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index f575589..59d10db 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -1912,9 +1912,8 @@ def create_html_report_group_results(
 
     print("Generating report...")
 
-    # Generate report only 5 random subjects
-    # SUBJECTS = np.random.choice(SUBJECTS, 5)
-    SUBJECTS = SUBJECTS[:1]
+    # Generate report only 3 subjects
+    SUBJECTS = SUBJECTS[:3]
 
     start_time = 0
     end_time = 200

From 954d17bb2e7201181a96015c07918d4014e609a9 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 9 Jan 2025 17:05:39 -0500
Subject: [PATCH 175/401] improve report

---
 task_dFC/generate_report.py | 258 +++++++++++++++++++-----------------
 1 file changed, 138 insertions(+), 120 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 59d10db..87154a1 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -1029,6 +1029,8 @@ def plot_visual_clstr_centroids(
             fmt=".0f",
             cmap="Reds",
             cbar_kws={"label": "Co-occurrence"},
+            yticklabels=["rest", "task"],
+            xticklabels=[str(i + 1) for i in range(co_occurrence_matrix.shape[1])],
         )
         plt.title("Co-occurrence matrix")
         plt.xlabel("Cluster")
@@ -1050,6 +1052,8 @@ def plot_visual_clstr_centroids(
             fmt=".2f",
             cmap="Reds",
             cbar_kws={"label": "Percentage"},
+            yticklabels=["rest", "task"],
+            xticklabels=[str(i + 1) for i in range(co_occurrence_matrix.shape[1])],
         )
         plt.title("Cluster label percentage")
         plt.xlabel("Cluster")
@@ -1071,6 +1075,8 @@ def plot_visual_clstr_centroids(
             fmt=".2f",
             cmap="Reds",
             cbar_kws={"label": "Percentage"},
+            yticklabels=["rest", "task"],
+            xticklabels=[str(i + 1) for i in range(co_occurrence_matrix.shape[1])],
         )
         plt.title("Task label percentage")
         plt.xlabel("Cluster")
@@ -1375,61 +1381,65 @@ def create_html_report_subj_results(
 
                 # display GLM
                 glm_img = f"{subj_dir}/GLM/{session_task_run_dir}/glm.png"
-                img = plt.imread(glm_img)
-                height, width, _ = img.shape
-                # change the width so that height equals img_height
-                width = int(width * img_height / height)
-                # replace the path to the image with a relative path
-                glm_img = glm_img.replace(subj_dir, ".")
-                file.write(
-                    f"<img src='{glm_img}' alt='GLM' width='{width}' height='{img_height}'>\n"
-                )
-                file.write("<br>\n")
+                if os.path.exists(glm_img):
+                    img = plt.imread(glm_img)
+                    height, width, _ = img.shape
+                    # change the width so that height equals img_height
+                    width = int(width * img_height / height)
+                    # replace the path to the image with a relative path
+                    glm_img = glm_img.replace(subj_dir, ".")
+                    file.write(
+                        f"<img src='{glm_img}' alt='GLM' width='{width}' height='{img_height}'>\n"
+                    )
+                    file.write("<br>\n")
 
                 # display ROI signals
                 ROI_signals_img = (
                     f"{subj_dir}/ROI_signals/{session_task_run_dir}/ROI_signals.png"
                 )
-                img = plt.imread(ROI_signals_img)
-                height, width, _ = img.shape
-                # change the width so that height equals img_height
-                width = int(width * img_height / height)
-                # replace the path to the image with a relative path
-                ROI_signals_img = ROI_signals_img.replace(subj_dir, ".")
-                file.write(
-                    f"<img src='{ROI_signals_img}' alt='ROI signals' width='{width}' height='{img_height}'>\n"
-                )
-                file.write("<br>\n")
+                if os.path.exists(ROI_signals_img):
+                    img = plt.imread(ROI_signals_img)
+                    height, width, _ = img.shape
+                    # change the width so that height equals img_height
+                    width = int(width * img_height / height)
+                    # replace the path to the image with a relative path
+                    ROI_signals_img = ROI_signals_img.replace(subj_dir, ".")
+                    file.write(
+                        f"<img src='{ROI_signals_img}' alt='ROI signals' width='{width}' height='{img_height}'>\n"
+                    )
+                    file.write("<br>\n")
 
                 # display event labels
                 event_labels_img = (
                     f"{subj_dir}/event_labels/{session_task_run_dir}/event_labels.png"
                 )
-                img = plt.imread(event_labels_img)
-                height, width, _ = img.shape
-                # change the width so that height equals img_height
-                width = int(width * img_height / height)
-                # replace the path to the image with a relative path
-                event_labels_img = event_labels_img.replace(subj_dir, ".")
-                file.write(
-                    f"<img src='{event_labels_img}' alt='Event labels' width='{width}' height='{img_height}'>\n"
-                )
-                file.write("<br>\n")
+                if os.path.exists(event_labels_img):
+                    img = plt.imread(event_labels_img)
+                    height, width, _ = img.shape
+                    # change the width so that height equals img_height
+                    width = int(width * img_height / height)
+                    # replace the path to the image with a relative path
+                    event_labels_img = event_labels_img.replace(subj_dir, ".")
+                    file.write(
+                        f"<img src='{event_labels_img}' alt='Event labels' width='{width}' height='{img_height}'>\n"
+                    )
+                    file.write("<br>\n")
 
                 # display task presence
                 task_presence_img = (
                     f"{subj_dir}/task_presence/{session_task_run_dir}/task_presence.png"
                 )
-                img = plt.imread(task_presence_img)
-                height, width, _ = img.shape
-                # change the width so that height equals img_height
-                width = int(width * img_height / height)
-                # replace the path to the image with a relative path
-                task_presence_img = task_presence_img.replace(subj_dir, ".")
-                file.write(
-                    f"<img src='{task_presence_img}' alt='Task presence' width='{width}' height='{img_height}'>\n"
-                )
-                file.write("<br>\n")
+                if os.path.exists(task_presence_img):
+                    img = plt.imread(task_presence_img)
+                    height, width, _ = img.shape
+                    # change the width so that height equals img_height
+                    width = int(width * img_height / height)
+                    # replace the path to the image with a relative path
+                    task_presence_img = task_presence_img.replace(subj_dir, ".")
+                    file.write(
+                        f"<img src='{task_presence_img}' alt='Task presence' width='{width}' height='{img_height}'>\n"
+                    )
+                    file.write("<br>\n")
 
                 # display dFC matrices
                 img_height = 45
@@ -1564,15 +1574,16 @@ def create_html_report_group_results(
                         classification_img = f"{classification_dir}/ML_results_classify_KNN_{task}_{embedding}.png"
                     else:
                         classification_img = f"{classification_dir}/ML_results_classify_KNN_{task}_{run}_{embedding}.png"
-                    img = plt.imread(classification_img)
-                    height, width, _ = img.shape
-                    # change the width so that height equals img_height
-                    width = int(width * img_height / height)
-                    # replace the path to the image with a relative path
-                    classification_img = classification_img.replace(group_dir, ".")
-                    file.write(
-                        f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
-                    )
+                    if os.path.exists(classification_img):
+                        img = plt.imread(classification_img)
+                        height, width, _ = img.shape
+                        # change the width so that height equals img_height
+                        width = int(width * img_height / height)
+                        # replace the path to the image with a relative path
+                        classification_img = classification_img.replace(group_dir, ".")
+                        file.write(
+                            f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
+                        )
 
                     # display Logistic regression classification results
                     file.write("<h3>Logistic Regression</h3>\n")
@@ -1580,15 +1591,16 @@ def create_html_report_group_results(
                         classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{embedding}.png"
                     else:
                         classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{run}_{embedding}.png"
-                    img = plt.imread(classification_img)
-                    height, width, _ = img.shape
-                    # change the width so that height equals img_height
-                    width = int(width * img_height / height)
-                    # replace the path to the image with a relative path
-                    classification_img = classification_img.replace(group_dir, ".")
-                    file.write(
-                        f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
-                    )
+                    if os.path.exists(classification_img):
+                        img = plt.imread(classification_img)
+                        height, width, _ = img.shape
+                        # change the width so that height equals img_height
+                        width = int(width * img_height / height)
+                        # replace the path to the image with a relative path
+                        classification_img = classification_img.replace(group_dir, ".")
+                        file.write(
+                            f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
+                        )
 
                     file.write("<br>\n")
 
@@ -1615,34 +1627,36 @@ def create_html_report_group_results(
                         clustering_img = f"{clustering_dir}/clustering_results_ARI_{task}_{embedding}.png"
                     else:
                         clustering_img = f"{clustering_dir}/clustering_results_ARI_{task}_{run}_{embedding}.png"
-                    img = plt.imread(clustering_img)
-                    height, width, _ = img.shape
-                    # change the width so that height equals img_height
-                    width = int(width * img_height / height)
-                    # replace the path to the image with a relative path
-                    clustering_img = clustering_img.replace(group_dir, ".")
-                    file.write(
-                        f"<img src='{clustering_img}' alt='Clustering results' width='{width}' height='{img_height}'>\n"
-                    )
+                    if os.path.exists(clustering_img):
+                        img = plt.imread(clustering_img)
+                        height, width, _ = img.shape
+                        # change the width so that height equals img_height
+                        width = int(width * img_height / height)
+                        # replace the path to the image with a relative path
+                        clustering_img = clustering_img.replace(group_dir, ".")
+                        file.write(
+                            f"<img src='{clustering_img}' alt='Clustering results' width='{width}' height='{img_height}'>\n"
+                        )
 
-                    file.write("<br>\n")
+                        file.write("<br>\n")
 
                     # display clustering SI results
                     if run is None:
                         clustering_img = f"{clustering_dir}/clustering_results_SI_{task}_{embedding}.png"
                     else:
                         clustering_img = f"{clustering_dir}/clustering_results_SI_{task}_{run}_{embedding}.png"
-                    img = plt.imread(clustering_img)
-                    height, width, _ = img.shape
-                    # change the width so that height equals img_height
-                    width = int(width * img_height / height)
-                    # replace the path to the image with a relative path
-                    clustering_img = clustering_img.replace(group_dir, ".")
-                    file.write(
-                        f"<img src='{clustering_img}' alt='Clustering results' width='{width}' height='{img_height}'>\n"
-                    )
+                    if os.path.exists(clustering_img):
+                        img = plt.imread(clustering_img)
+                        height, width, _ = img.shape
+                        # change the width so that height equals img_height
+                        width = int(width * img_height / height)
+                        # replace the path to the image with a relative path
+                        clustering_img = clustering_img.replace(group_dir, ".")
+                        file.write(
+                            f"<img src='{clustering_img}' alt='Clustering results' width='{width}' height='{img_height}'>\n"
+                        )
 
-                    file.write("<br>\n")
+                        file.write("<br>\n")
 
     # paradigm clustering results
     file.write("<h1>Paradigm Clustering Results</h1>\n")
@@ -1740,15 +1754,16 @@ def create_html_report_group_results(
                     measure_name = centroids_img_file.split("_")[-1].split(".")[0]
                     file.write(f"<h3>{measure_name}</h3>\n")
                     # get the original size of the image
-                    img = plt.imread(centroid_img)
-                    height, width, _ = img.shape
-                    # change the width so that height equals img_height
-                    width = int(width * img_height / height)
-                    # replace the path to the image with a relative path
-                    centroid_img = centroid_img.replace(group_dir, ".")
-                    file.write(
-                        f"<img src='{centroid_img}' alt='Visual clustering centroids' width='{width}' height='{img_height}'>\n"
-                    )
+                    if os.path.exists(centroid_img):
+                        img = plt.imread(centroid_img)
+                        height, width, _ = img.shape
+                        # change the width so that height equals img_height
+                        width = int(width * img_height / height)
+                        # replace the path to the image with a relative path
+                        centroid_img = centroid_img.replace(group_dir, ".")
+                        file.write(
+                            f"<img src='{centroid_img}' alt='Visual clustering centroids' width='{width}' height='{img_height}'>\n"
+                        )
 
                     # visual-centroids_{suffix}.png
                     suffix = centroids_img_file[
@@ -1757,45 +1772,48 @@ def create_html_report_group_results(
 
                     # display co-occurrence matrix
                     co_occurrence_matrix_img = f"{visual_clustering_centroids_dir}/co-occurrence-matrix_{suffix}.png"
-                    img = plt.imread(co_occurrence_matrix_img)
-                    height, width, _ = img.shape
-                    # change the width so that height equals img_height
-                    width = int(width * img_height / height)
-                    # replace the path to the image with a relative path
-                    co_occurrence_matrix_img = co_occurrence_matrix_img.replace(
-                        group_dir, "."
-                    )
-                    file.write(
-                        f"<img src='{co_occurrence_matrix_img}' alt='Co-occurrence matrix' width='{width}' height='{img_height}'>\n"
-                    )
+                    if os.path.exists(co_occurrence_matrix_img):
+                        img = plt.imread(co_occurrence_matrix_img)
+                        height, width, _ = img.shape
+                        # change the width so that height equals img_height
+                        width = int(width * img_height / height)
+                        # replace the path to the image with a relative path
+                        co_occurrence_matrix_img = co_occurrence_matrix_img.replace(
+                            group_dir, "."
+                        )
+                        file.write(
+                            f"<img src='{co_occurrence_matrix_img}' alt='Co-occurrence matrix' width='{width}' height='{img_height}'>\n"
+                        )
 
                     # display cluster label percentage
                     cluster_label_percentage_img = f"{visual_clustering_centroids_dir}/cluster-label-percentage_{suffix}.png"
-                    img = plt.imread(cluster_label_percentage_img)
-                    height, width, _ = img.shape
-                    # change the width so that height equals img_height
-                    width = int(width * img_height / height)
-                    # replace the path to the image with a relative path
-                    cluster_label_percentage_img = cluster_label_percentage_img.replace(
-                        group_dir, "."
-                    )
-                    file.write(
-                        f"<img src='{cluster_label_percentage_img}' alt='Cluster label percentage' width='{width}' height='{img_height}'>\n"
-                    )
+                    if os.path.exists(cluster_label_percentage_img):
+                        img = plt.imread(cluster_label_percentage_img)
+                        height, width, _ = img.shape
+                        # change the width so that height equals img_height
+                        width = int(width * img_height / height)
+                        # replace the path to the image with a relative path
+                        cluster_label_percentage_img = (
+                            cluster_label_percentage_img.replace(group_dir, ".")
+                        )
+                        file.write(
+                            f"<img src='{cluster_label_percentage_img}' alt='Cluster label percentage' width='{width}' height='{img_height}'>\n"
+                        )
 
                     # display task label percentage
                     task_label_percentage_img = f"{visual_clustering_centroids_dir}/task-label-percentage_{suffix}.png"
-                    img = plt.imread(task_label_percentage_img)
-                    height, width, _ = img.shape
-                    # change the width so that height equals img_height
-                    width = int(width * img_height / height)
-                    # replace the path to the image with a relative path
-                    task_label_percentage_img = task_label_percentage_img.replace(
-                        group_dir, "."
-                    )
-                    file.write(
-                        f"<img src='{task_label_percentage_img}' alt='Task label percentage' width='{width}' height='{img_height}'>\n"
-                    )
+                    if os.path.exists(task_label_percentage_img):
+                        img = plt.imread(task_label_percentage_img)
+                        height, width, _ = img.shape
+                        # change the width so that height equals img_height
+                        width = int(width * img_height / height)
+                        # replace the path to the image with a relative path
+                        task_label_percentage_img = task_label_percentage_img.replace(
+                            group_dir, "."
+                        )
+                        file.write(
+                            f"<img src='{task_label_percentage_img}' alt='Task label percentage' width='{width}' height='{img_height}'>\n"
+                        )
 
                     file.write("<br>\n")
 

From d0b7ebbf1cf8e440f910f8ccc20474a30243c6df Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 9 Jan 2025 23:04:46 -0500
Subject: [PATCH 176/401] change font in report to bold

---
 task_dFC/generate_report.py | 102 +++++++++++++++++++++++++++++++++++-
 1 file changed, 100 insertions(+), 2 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 87154a1..f8b360b 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -651,6 +651,20 @@ def plot_ML_results(
         dodge=True,
         capsize=0.1,
     )
+    plt.xlabel(g.get_xlabel(), fontweight="bold")
+    plt.ylabel(g.get_ylabel(), fontweight="bold")
+    plt.xticks(fontweight="bold")
+    plt.yticks(fontweight="bold")
+    plt.legend(
+        title=g.legend_.get_title().get_text(),
+        title_fontsize="13",
+        title_fontweight="bold",
+        fontsize="11",
+        loc="best",
+        frameon=True,
+    )
+    for text in g.legend_.get_texts():
+        text.set_fontweight("bold")
     g.axhline(0.5, color="r", linestyle="--")
     # set the y-axis upper limit to 1, but not set the lower limit
     g.set(ylim=(None, 1))
@@ -755,6 +769,20 @@ def plot_clustering_results(
         dodge=True,
         capsize=0.1,
     )
+    plt.xlabel(g.get_xlabel(), fontweight="bold")
+    plt.ylabel(g.get_ylabel(), fontweight="bold")
+    plt.xticks(fontweight="bold")
+    plt.yticks(fontweight="bold")
+    plt.legend(
+        title=g.legend_.get_title().get_text(),
+        title_fontsize="13",
+        title_fontweight="bold",
+        fontsize="11",
+        loc="best",
+        frameon=True,
+    )
+    for text in g.legend_.get_texts():
+        text.set_fontweight("bold")
     g.axhline(0.0, color="r", linestyle="--")
     # set the y-axis upper limit to 1, but not set the lower limit
     g.set(ylim=(None, 1))
@@ -800,6 +828,20 @@ def plot_clustering_results(
         dodge=True,
         capsize=0.1,
     )
+    plt.xlabel(g.get_xlabel(), fontweight="bold")
+    plt.ylabel(g.get_ylabel(), fontweight="bold")
+    plt.xticks(fontweight="bold")
+    plt.yticks(fontweight="bold")
+    plt.legend(
+        title=g.legend_.get_title().get_text(),
+        title_fontsize="13",
+        title_fontweight="bold",
+        fontsize="11",
+        loc="best",
+        frameon=True,
+    )
+    for text in g.legend_.get_texts():
+        text.set_fontweight("bold")
     # set the y-axis upper limit to 1, but not set the lower limit
     g.set(ylim=(None, 1))
     if show_title:
@@ -898,6 +940,20 @@ def plot_paradigm_clustering_score(
         dodge=True,
         capsize=0.1,
     )
+    plt.xlabel(g.get_xlabel(), fontweight="bold")
+    plt.ylabel(g.get_ylabel(), fontweight="bold")
+    plt.xticks(fontweight="bold")
+    plt.yticks(fontweight="bold")
+    plt.legend(
+        title=g.legend_.get_title().get_text(),
+        title_fontsize="13",
+        title_fontweight="bold",
+        fontsize="11",
+        loc="best",
+        frameon=True,
+    )
+    for text in g.legend_.get_texts():
+        text.set_fontweight("bold")
     g.axhline(0.0, color="r", linestyle="--")
     # set the y-axis upper limit to 1, but not set the lower limit
     g.set(ylim=(None, 1))
@@ -936,6 +992,20 @@ def plot_paradigm_clustering_score(
         dodge=True,
         capsize=0.1,
     )
+    plt.xlabel(g.get_xlabel(), fontweight="bold")
+    plt.ylabel(g.get_ylabel(), fontweight="bold")
+    plt.xticks(fontweight="bold")
+    plt.yticks(fontweight="bold")
+    plt.legend(
+        title=g.legend_.get_title().get_text(),
+        title_fontsize="13",
+        title_fontweight="bold",
+        fontsize="11",
+        loc="best",
+        frameon=True,
+    )
+    for text in g.legend_.get_texts():
+        text.set_fontweight("bold")
     # set the y-axis upper limit to 1, but not set the lower limit
     g.set(ylim=(None, 1))
     if show_title:
@@ -1299,7 +1369,7 @@ def plot_task_presence_features(
 
     for i, feature in enumerate(FEATURES):
         plt.figure(figsize=(10, 5))
-        sns.pointplot(
+        g = sns.pointplot(
             data=task_features_df,
             x="task",
             y=feature,
@@ -1308,6 +1378,20 @@ def plot_task_presence_features(
             dodge=True,
             capsize=0.1,
         )
+        plt.xlabel(g.get_xlabel(), fontweight="bold")
+        plt.ylabel(g.get_ylabel(), fontweight="bold")
+        plt.xticks(fontweight="bold")
+        plt.yticks(fontweight="bold")
+        plt.legend(
+            title=g.legend_.get_title().get_text(),
+            title_fontsize="13",
+            title_fontweight="bold",
+            fontsize="11",
+            loc="best",
+            frameon=True,
+        )
+        for text in g.legend_.get_texts():
+            text.set_fontweight("bold")
 
         # save the figure
         plt.savefig(
@@ -1320,7 +1404,7 @@ def plot_task_presence_features(
         plt.close()
 
         plt.figure(figsize=(10, 5))
-        sns.pointplot(
+        g = sns.pointplot(
             data=task_features_hrf_df,
             x="task",
             y=feature,
@@ -1329,6 +1413,20 @@ def plot_task_presence_features(
             dodge=True,
             capsize=0.1,
         )
+        plt.xlabel(g.get_xlabel(), fontweight="bold")
+        plt.ylabel(g.get_ylabel(), fontweight="bold")
+        plt.xticks(fontweight="bold")
+        plt.yticks(fontweight="bold")
+        plt.legend(
+            title=g.legend_.get_title().get_text(),
+            title_fontsize="13",
+            title_fontweight="bold",
+            fontsize="11",
+            loc="best",
+            frameon=True,
+        )
+        for text in g.legend_.get_texts():
+            text.set_fontweight("bold")
 
         # save the figure
         plt.savefig(

From adac72b981cbb4950cc76694a4a0a4638d42c72b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 10 Jan 2025 18:30:46 -0500
Subject: [PATCH 177/401] minor

---
 task_dFC/generate_report.py | 70 -------------------------------------
 1 file changed, 70 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index f8b360b..fc16778 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -655,16 +655,6 @@ def plot_ML_results(
     plt.ylabel(g.get_ylabel(), fontweight="bold")
     plt.xticks(fontweight="bold")
     plt.yticks(fontweight="bold")
-    plt.legend(
-        title=g.legend_.get_title().get_text(),
-        title_fontsize="13",
-        title_fontweight="bold",
-        fontsize="11",
-        loc="best",
-        frameon=True,
-    )
-    for text in g.legend_.get_texts():
-        text.set_fontweight("bold")
     g.axhline(0.5, color="r", linestyle="--")
     # set the y-axis upper limit to 1, but not set the lower limit
     g.set(ylim=(None, 1))
@@ -773,16 +763,6 @@ def plot_clustering_results(
     plt.ylabel(g.get_ylabel(), fontweight="bold")
     plt.xticks(fontweight="bold")
     plt.yticks(fontweight="bold")
-    plt.legend(
-        title=g.legend_.get_title().get_text(),
-        title_fontsize="13",
-        title_fontweight="bold",
-        fontsize="11",
-        loc="best",
-        frameon=True,
-    )
-    for text in g.legend_.get_texts():
-        text.set_fontweight("bold")
     g.axhline(0.0, color="r", linestyle="--")
     # set the y-axis upper limit to 1, but not set the lower limit
     g.set(ylim=(None, 1))
@@ -832,16 +812,6 @@ def plot_clustering_results(
     plt.ylabel(g.get_ylabel(), fontweight="bold")
     plt.xticks(fontweight="bold")
     plt.yticks(fontweight="bold")
-    plt.legend(
-        title=g.legend_.get_title().get_text(),
-        title_fontsize="13",
-        title_fontweight="bold",
-        fontsize="11",
-        loc="best",
-        frameon=True,
-    )
-    for text in g.legend_.get_texts():
-        text.set_fontweight("bold")
     # set the y-axis upper limit to 1, but not set the lower limit
     g.set(ylim=(None, 1))
     if show_title:
@@ -944,16 +914,6 @@ def plot_paradigm_clustering_score(
     plt.ylabel(g.get_ylabel(), fontweight="bold")
     plt.xticks(fontweight="bold")
     plt.yticks(fontweight="bold")
-    plt.legend(
-        title=g.legend_.get_title().get_text(),
-        title_fontsize="13",
-        title_fontweight="bold",
-        fontsize="11",
-        loc="best",
-        frameon=True,
-    )
-    for text in g.legend_.get_texts():
-        text.set_fontweight("bold")
     g.axhline(0.0, color="r", linestyle="--")
     # set the y-axis upper limit to 1, but not set the lower limit
     g.set(ylim=(None, 1))
@@ -996,16 +956,6 @@ def plot_paradigm_clustering_score(
     plt.ylabel(g.get_ylabel(), fontweight="bold")
     plt.xticks(fontweight="bold")
     plt.yticks(fontweight="bold")
-    plt.legend(
-        title=g.legend_.get_title().get_text(),
-        title_fontsize="13",
-        title_fontweight="bold",
-        fontsize="11",
-        loc="best",
-        frameon=True,
-    )
-    for text in g.legend_.get_texts():
-        text.set_fontweight("bold")
     # set the y-axis upper limit to 1, but not set the lower limit
     g.set(ylim=(None, 1))
     if show_title:
@@ -1382,16 +1332,6 @@ def plot_task_presence_features(
         plt.ylabel(g.get_ylabel(), fontweight="bold")
         plt.xticks(fontweight="bold")
         plt.yticks(fontweight="bold")
-        plt.legend(
-            title=g.legend_.get_title().get_text(),
-            title_fontsize="13",
-            title_fontweight="bold",
-            fontsize="11",
-            loc="best",
-            frameon=True,
-        )
-        for text in g.legend_.get_texts():
-            text.set_fontweight("bold")
 
         # save the figure
         plt.savefig(
@@ -1417,16 +1357,6 @@ def plot_task_presence_features(
         plt.ylabel(g.get_ylabel(), fontweight="bold")
         plt.xticks(fontweight="bold")
         plt.yticks(fontweight="bold")
-        plt.legend(
-            title=g.legend_.get_title().get_text(),
-            title_fontsize="13",
-            title_fontweight="bold",
-            fontsize="11",
-            loc="best",
-            frameon=True,
-        )
-        for text in g.legend_.get_texts():
-            text.set_fontweight("bold")
 
         # save the figure
         plt.savefig(

From e4ea01de64c2bfeb5e26fb18776782fbc79da957 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 5 Feb 2025 13:28:04 -0500
Subject: [PATCH 178/401] add across_dataset analysis

---
 task_dFC/ML.py                                | 70 +++++++++----------
 task_dFC/across_dataset.py                    | 59 ++++++++++++++++
 .../run_scripts_slurm/multi_dataset_info.json |  6 ++
 3 files changed, 100 insertions(+), 35 deletions(-)
 create mode 100644 task_dFC/across_dataset.py
 create mode 100644 task_dFC/run_scripts_slurm/multi_dataset_info.json

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index a45f3ea..62b35a6 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -405,41 +405,41 @@ def run_clustering_for_visual(
         print(f"Error in classification for dFC ID {dFC_id}: {e}")
         traceback.print_exc()
     print(f"Task presence classification finished for dFC ID {dFC_id}.")
-    print(f"Task presence clustering started for dFC ID {dFC_id} ...")
-    try:
-        run_clustering(
-            dFC_id=dFC_id,
-            TASKS=TASKS,
-            RUNS=RUNS,
-            SESSIONS=SESSIONS,
-            roi_root=roi_root,
-            dFC_root=dFC_root,
-            output_root=ML_root,
-            normalize_dFC=True,
-        )
-    except Exception as e:
-        print(f"Error in clustering for dFC ID {dFC_id}: {e}")
-        traceback.print_exc()
-
-    print(f"Task presence clustering finished for dFC ID {dFC_id}.")
-
-    print(f"Task paradigm clustering started for dFC ID {dFC_id} ...")
-    try:
-        run_task_paradigm_clustering(
-            dFC_id=dFC_id,
-            TASKS=TASKS,
-            RUNS=RUNS,
-            SESSIONS=SESSIONS,
-            roi_root=roi_root,
-            dFC_root=dFC_root,
-            output_root=ML_root,
-            normalize_dFC=True,
-        )
-    except Exception as e:
-        print(f"Error in task paradigm clustering for dFC ID {dFC_id}: {e}")
-        traceback.print_exc()
-
-    print(f"Task paradigm clustering finished for dFC ID {dFC_id}.")
+    # print(f"Task presence clustering started for dFC ID {dFC_id} ...")
+    # try:
+    #     run_clustering(
+    #         dFC_id=dFC_id,
+    #         TASKS=TASKS,
+    #         RUNS=RUNS,
+    #         SESSIONS=SESSIONS,
+    #         roi_root=roi_root,
+    #         dFC_root=dFC_root,
+    #         output_root=ML_root,
+    #         normalize_dFC=True,
+    #     )
+    # except Exception as e:
+    #     print(f"Error in clustering for dFC ID {dFC_id}: {e}")
+    #     traceback.print_exc()
+
+    # print(f"Task presence clustering finished for dFC ID {dFC_id}.")
+
+    # print(f"Task paradigm clustering started for dFC ID {dFC_id} ...")
+    # try:
+    #     run_task_paradigm_clustering(
+    #         dFC_id=dFC_id,
+    #         TASKS=TASKS,
+    #         RUNS=RUNS,
+    #         SESSIONS=SESSIONS,
+    #         roi_root=roi_root,
+    #         dFC_root=dFC_root,
+    #         output_root=ML_root,
+    #         normalize_dFC=True,
+    #     )
+    # except Exception as e:
+    #     print(f"Error in task paradigm clustering for dFC ID {dFC_id}: {e}")
+    #     traceback.print_exc()
+
+    # print(f"Task paradigm clustering finished for dFC ID {dFC_id}.")
 
     print(f"Clustering for visualization started for dFC ID {dFC_id} ...")
     try:
diff --git a/task_dFC/across_dataset.py b/task_dFC/across_dataset.py
new file mode 100644
index 0000000..206a5af
--- /dev/null
+++ b/task_dFC/across_dataset.py
@@ -0,0 +1,59 @@
+import argparse
+import json
+import os
+import traceback
+
+import numpy as np
+
+from pydfc.ml_utils import (
+    cluster_for_visual,
+    extract_task_features,
+    task_paradigm_clustering,
+    task_presence_classification,
+    task_presence_clustering,
+)
+
+#######################################################################################
+
+
+def function_f():
+    pass  # placeholder stub: takes no arguments and does nothing
+
+
+#######################################################################################
+
+if __name__ == "__main__":
+    # parse command-line arguments (only --multi_dataset_info is defined)
+    HELPTEXT = """
+    Script to run across-dataset analysis on dFC results.
+    """
+
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    parser.add_argument(
+        "--multi_dataset_info", type=str, help="path to multi-dataset info file"
+    )
+
+    args = parser.parse_args()
+
+    multi_dataset_info = args.multi_dataset_info
+
+    # Read dataset info: the path string is rebound to the parsed JSON content
+    with open(multi_dataset_info, "r") as f:
+        multi_dataset_info = json.load(f)
+
+    print("Multi-Dataset Analysis started ...")
+
+    main_root = multi_dataset_info["main_root"]  # read but not used below
+    DATASETS = multi_dataset_info["DATASETS"]  # read but not used below
+
+    try:
+        function_f()
+    except Exception as e:
+        print(f"Error in task features extraction: {e}")  # NOTE(review): wording mentions task features extraction, but the call is function_f() - confirm
+        traceback.print_exc()
+    print("Task features extraction finished.")  # printed whether or not function_f() raised
+
+    print("Multi-Dataset Analysis finished.")
+
+#######################################################################################
diff --git a/task_dFC/run_scripts_slurm/multi_dataset_info.json b/task_dFC/run_scripts_slurm/multi_dataset_info.json
new file mode 100644
index 0000000..d2aa510
--- /dev/null
+++ b/task_dFC/run_scripts_slurm/multi_dataset_info.json
@@ -0,0 +1,6 @@
+{
+	"main_root" : "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/{dataset}",
+	"DATASETS" : [
+		"ses-1"
+	]
+}

From 837b4af16918dfa5ffc193ee6a736cb8b01c92b1 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 5 Feb 2025 18:48:06 -0500
Subject: [PATCH 179/401] add other classification acc metrics

---
 pydfc/ml_utils.py           |  82 +++++++++++++++-----
 pydfc/report_util.py        |  80 ++++++++++++++++++++
 task_dFC/ML.py              |  17 +----
 task_dFC/generate_report.py | 146 ++++++++++++++++--------------------
 4 files changed, 211 insertions(+), 114 deletions(-)
 create mode 100644 pydfc/report_util.py

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 4333fbb..112be80 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -16,7 +16,17 @@
 from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.manifold import SpectralEmbedding
-from sklearn.metrics import adjusted_rand_score, balanced_accuracy_score, silhouette_score
+from sklearn.metrics import (
+    accuracy_score,
+    adjusted_rand_score,
+    average_precision_score,
+    balanced_accuracy_score,
+    confusion_matrix,
+    f1_score,
+    precision_score,
+    recall_score,
+    silhouette_score,
+)
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors, kneighbors_graph
 from sklearn.pipeline import make_pipeline
@@ -1098,10 +1108,6 @@ def task_presence_classification(
         "task": list(),
         "run": list(),
         "dFC method": list(),
-        "Logistic regression accuracy": list(),
-        "KNN accuracy": list(),
-        # "Random Forest accuracy": list(),
-        # "Gradient Boosting accuracy": list(),
         "embedding": list(),
     }
     for embedding in ["PCA", "LE"]:
@@ -1161,6 +1167,8 @@ def task_presence_classification(
         # RF = RF_RESULT["RF_model"]
         # GBT = GBT_RESULT["GB_model"]
 
+        ML_models = {"Logistic regression": log_reg, "KNN": KNN}
+
         for subj in SUBJECTS:
             ML_scores["subj_id"].append(subj)
             if subj in train_subjects:
@@ -1172,21 +1180,55 @@ def task_presence_classification(
                 features = X_test_embedded[subj_label_test == subj, :]
                 target = y_test[subj_label_test == subj]
 
-            pred_lr = log_reg.predict(features)
-            pred_KNN = KNN.predict(features)
-            # pred_RF = RF.predict(features)
-            # pred_GBT = GBT.predict(features)
-
-            ML_scores["Logistic regression accuracy"].append(
-                balanced_accuracy_score(target, pred_lr)
-            )
-            ML_scores["KNN accuracy"].append(balanced_accuracy_score(target, pred_KNN))
-            # ML_scores["Random Forest accuracy"].append(
-            #     balanced_accuracy_score(target, pred_RF)
-            # )
-            # ML_scores["Gradient Boosting accuracy"].append(
-            #     balanced_accuracy_score(target, pred_GBT)
-            # )
+            # measure pred score using different metrics on each subj
+            for model_name, model in ML_models.items():
+                pred = model.predict(features)
+                # accuracy score
+                if not f"{model_name} accuracy" in ML_scores:
+                    ML_scores[f"{model_name} accuracy"] = list()
+                ML_scores[f"{model_name} accuracy"].append(accuracy_score(target, pred))
+                # balanced accuracy score
+                if not f"{model_name} balanced accuracy" in ML_scores:
+                    ML_scores[f"{model_name} balanced accuracy"] = list()
+                ML_scores[f"{model_name} balanced accuracy"].append(
+                    balanced_accuracy_score(target, pred)
+                )
+                # precision score
+                if not f"{model_name} precision" in ML_scores:
+                    ML_scores[f"{model_name} precision"] = list()
+                ML_scores[f"{model_name} precision"].append(precision_score(target, pred))
+                # recall score
+                if not f"{model_name} recall" in ML_scores:
+                    ML_scores[f"{model_name} recall"] = list()
+                ML_scores[f"{model_name} recall"].append(recall_score(target, pred))
+                # f1 score
+                if not f"{model_name} f1" in ML_scores:
+                    ML_scores[f"{model_name} f1"] = list()
+                ML_scores[f"{model_name} f1"].append(f1_score(target, pred))
+                # confusion matrix
+                tn, fp, fn, tp = confusion_matrix(target, pred).ravel()
+                # false positive count
+                if not f"{model_name} fp" in ML_scores:
+                    ML_scores[f"{model_name} fp"] = list()
+                ML_scores[f"{model_name} fp"].append(fp)
+                # false negative count
+                if not f"{model_name} fn" in ML_scores:
+                    ML_scores[f"{model_name} fn"] = list()
+                ML_scores[f"{model_name} fn"].append(fn)
+                # true positive count
+                if not f"{model_name} tp" in ML_scores:
+                    ML_scores[f"{model_name} tp"] = list()
+                ML_scores[f"{model_name} tp"].append(tp)
+                # true negative count
+                if not f"{model_name} tn" in ML_scores:
+                    ML_scores[f"{model_name} tn"] = list()
+                ML_scores[f"{model_name} tn"].append(tn)
+                # average precision score
+                if not f"{model_name} average precision" in ML_scores:
+                    ML_scores[f"{model_name} average precision"] = list()
+                ML_scores[f"{model_name} average precision"].append(
+                    average_precision_score(target, pred)
+                )
 
             ML_scores["task"].append(task)
             ML_scores["run"].append(run)
diff --git a/pydfc/report_util.py b/pydfc/report_util.py
new file mode 100644
index 0000000..53d9a49
--- /dev/null
+++ b/pydfc/report_util.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+"""
+Functions to facilitate reporting.
+
+Created on Feb 5 2025
+@author: Mohammad Torabi
+"""
+
+import os
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+################################# Parameters ####################################
+
+fig_dpi = 120
+fig_bbox_inches = "tight"
+fig_pad = 0.1
+show_title = True
+save_fig_format = "png"  # pdf, png,
+
+########## Plotting Classification Results Functions ##########
+
+
+def plot_classification_metrics(
+    dataframe, ML_algorithm, pred_metric, title, suffix, output_dir
+):
+    """
+    Plot one classification metric per dFC method (train vs. test) and save it.
+
+    parameters:
+    ----------
+        dataframe: table with "dFC method", "group" and
+            f"{ML_algorithm} {pred_metric}" columns
+        ML_algorithm: str, model name used as the score-column prefix
+        pred_metric: str, e.g. "accuracy", "balanced accuracy", "precision",
+            "recall", "f1", "tp", "tn", "fp", "fn", "average precision"
+        title: str, figure title (drawn only if show_title is True)
+        suffix: str, appended to the output file name
+        output_dir: str, directory the figure is written to (created if missing)
+    """
+    plt.figure(figsize=(10, 5))
+    # close the figure even when plotting/saving raises (e.g. missing column),
+    # otherwise every failed call leaves an open matplotlib figure behind
+    try:
+        g = sns.pointplot(
+            data=dataframe,
+            x="dFC method",
+            y=f"{ML_algorithm} {pred_metric}",
+            hue="group",
+            hue_order=["train", "test"],
+            errorbar="sd",
+            linestyle="none",
+            dodge=True,
+            capsize=0.1,
+        )
+        plt.xlabel(g.get_xlabel(), fontweight="bold")
+        plt.ylabel(g.get_ylabel(), fontweight="bold")
+        plt.xticks(fontweight="bold")
+        plt.yticks(fontweight="bold")
+        if pred_metric == "balanced accuracy":
+            # add a horizontal line at 0.5 corresponding to chance level
+            g.axhline(0.5, color="r", linestyle="--")
+        if pred_metric not in ["fp", "fn", "tp", "tn"]:
+            # scores are bounded by 1 (counts are not); leave the lower limit free
+            g.set(ylim=(None, 1))
+        if show_title:
+            g.set_title(title, fontdict={"fontsize": 10, "fontweight": "bold"})
+
+        os.makedirs(output_dir, exist_ok=True)
+        pred_metric_no_space = pred_metric.replace(" ", "_")
+        plt.savefig(
+            f"{output_dir}/classification_{pred_metric_no_space}_{suffix}.{save_fig_format}",
+            dpi=fig_dpi,
+            bbox_inches=fig_bbox_inches,
+            pad_inches=fig_pad,
+            format=save_fig_format,
+        )
+    finally:
+        plt.close()
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 62b35a6..f2888a5 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -78,19 +78,8 @@ def run_classification(
     for session in SESSIONS:
         if not session is None:
             print(f"=================== {session} ===================")
-        ML_scores = {
-            "subj_id": list(),
-            "group": list(),
-            "task": list(),
-            "run": list(),
-            "dFC method": list(),
-            "Logistic regression accuracy": list(),
-            "KNN accuracy": list(),
-            # "Random Forest accuracy": list(),
-            # "Gradient Boosting accuracy": list(),
-            "embedding": list(),
-        }
 
+        ML_scores = {}
         ML_RESULT = {}
         for task_id, task in enumerate(TASKS):
             ML_RESULT[task] = {}
@@ -110,7 +99,9 @@ def run_classification(
                         ML_RESULT[task] = ML_RESULT_new
                     else:
                         ML_RESULT[task][run] = ML_RESULT_new
-                    for key in ML_scores:
+                    for key in ML_scores_new:
+                        if key not in ML_scores:
+                            ML_scores[key] = list()
                         ML_scores[key].extend(ML_scores_new[key])
                 except Exception as e:
                     print(
diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index fc16778..1fb7159 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -20,6 +20,7 @@
     rank_norm,
     visualize_conn_mat_dict,
 )
+from pydfc.report_util import plot_classification_metrics
 
 ################################# Parameters ####################################
 
@@ -586,17 +587,17 @@ def plot_dFC_matrices(
         )
 
 
-def plot_ML_results(
+def plot_classification_results(
     ML_root,
     output_root,
     task,
     run=None,
     session=None,
-    ML_algorithm="Random Forest",
+    ML_algorithm="KNN",
     embedding="PCA",
 ):
     """
-    Plot the ML results for a given task, run and session.
+    Plot the ML classification results for a given task, run and session.
     parameters:
     ----------
         ML_root: str, path to ML results
@@ -638,38 +639,12 @@ def plot_ML_results(
     dataframe = dataframe[dataframe["task"] == task]
     dataframe = dataframe[dataframe["embedding"] == embedding]
 
-    plt.figure(figsize=(10, 5))
-
-    g = sns.pointplot(
-        data=dataframe,
-        x="dFC method",
-        y=f"{ML_algorithm} accuracy",
-        hue="group",
-        hue_order=["train", "test"],
-        errorbar="sd",
-        linestyle="none",
-        dodge=True,
-        capsize=0.1,
-    )
-    plt.xlabel(g.get_xlabel(), fontweight="bold")
-    plt.ylabel(g.get_ylabel(), fontweight="bold")
-    plt.xticks(fontweight="bold")
-    plt.yticks(fontweight="bold")
-    g.axhline(0.5, color="r", linestyle="--")
-    # set the y-axis upper limit to 1, but not set the lower limit
-    g.set(ylim=(None, 1))
-    if show_title:
-        g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"})
-
     # save the figure
     if session is None:
         output_dir = f"{output_root}/group_results/classification"
     else:
         output_dir = f"{output_root}/group_results/classification/{session}"
 
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-
     if ML_algorithm == "Logistic regression":
         ML_algorithm_name = "LogReg"
     elif ML_algorithm == "KNN":
@@ -680,23 +655,32 @@ def plot_ML_results(
         ML_algorithm_name = "GBT"
 
     if run is None:
-        plt.savefig(
-            f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}_{embedding}.{save_fig_format}",
-            dpi=fig_dpi,
-            bbox_inches=fig_bbox_inches,
-            pad_inches=fig_pad,
-            format=save_fig_format,
-        )
+        suffix = f"{ML_algorithm_name}_{task}_{embedding}"
     else:
-        plt.savefig(
-            f"{output_dir}/ML_results_classify_{ML_algorithm_name}_{task}_{run}_{embedding}.{save_fig_format}",
-            dpi=fig_dpi,
-            bbox_inches=fig_bbox_inches,
-            pad_inches=fig_pad,
-            format=save_fig_format,
-        )
+        suffix = f"{ML_algorithm_name}_{task}_{run}_{embedding}"
+
+    metrics = [
+        "accuracy",
+        "balanced accuracy",
+        "precision",
+        "recall",
+        "f1",
+        "tp",
+        "tn",
+        "fp",
+        "fn",
+        "average precision",
+    ]
 
-    plt.close()
+    for metric in metrics:
+        plot_classification_metrics(
+            dataframe=dataframe,
+            ML_algorithm=ML_algorithm,
+            pred_metric=metric,
+            title=task,
+            suffix=suffix,
+            output_dir=output_dir,
+        )
 
 
 def plot_clustering_results(
@@ -1579,6 +1563,19 @@ def create_html_report_group_results(
     file.write("<br>\n")
 
     # classification results
+    metrics = [
+        "accuracy",
+        "balanced accuracy",
+        "precision",
+        "recall",
+        "f1",
+        "tp",
+        "tn",
+        "fp",
+        "fn",
+        "average precision",
+    ]
+    classification_models = {"LogReg": "Logistic Regression", "KNN": "KNN"}
     img_height = 300
     file.write("<h1>Classification Results</h1>\n")
     for session in SESSIONS:
@@ -1594,43 +1591,30 @@ def create_html_report_group_results(
                 else:
                     classification_dir = f"{group_dir}/classification"
 
-                for embedding in ["PCA", "LE"]:
-                    file.write(f"<h3>{embedding}</h3>\n")
-                    # display KNN classification results
-                    file.write("<h3>KNN</h3>\n")
-                    if run is None:
-                        classification_img = f"{classification_dir}/ML_results_classify_KNN_{task}_{embedding}.png"
-                    else:
-                        classification_img = f"{classification_dir}/ML_results_classify_KNN_{task}_{run}_{embedding}.png"
-                    if os.path.exists(classification_img):
-                        img = plt.imread(classification_img)
-                        height, width, _ = img.shape
-                        # change the width so that height equals img_height
-                        width = int(width * img_height / height)
-                        # replace the path to the image with a relative path
-                        classification_img = classification_img.replace(group_dir, ".")
-                        file.write(
-                            f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
-                        )
-
-                    # display Logistic regression classification results
-                    file.write("<h3>Logistic Regression</h3>\n")
-                    if run is None:
-                        classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{embedding}.png"
-                    else:
-                        classification_img = f"{classification_dir}/ML_results_classify_LogReg_{task}_{run}_{embedding}.png"
-                    if os.path.exists(classification_img):
-                        img = plt.imread(classification_img)
-                        height, width, _ = img.shape
-                        # change the width so that height equals img_height
-                        width = int(width * img_height / height)
-                        # replace the path to the image with a relative path
-                        classification_img = classification_img.replace(group_dir, ".")
-                        file.write(
-                            f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
-                        )
+                for model in classification_models:
+                    file.write(f"<h3>{classification_models[model]}</h3>\n")
+                    for embedding in ["PCA", "LE"]:
+                        file.write(f"<h3>{embedding}</h3>\n")
+                        for metric in metrics:
+                            metric_no_space = metric.replace(" ", "_")
+                            if run is None:
+                                classification_img = f"{classification_dir}/classification_{metric_no_space}_{model}_{task}_{embedding}.png"
+                            else:
+                                classification_img = f"{classification_dir}/classification_{metric_no_space}_{model}_{task}_{run}_{embedding}.png"
+                            if os.path.exists(classification_img):
+                                img = plt.imread(classification_img)
+                                height, width, _ = img.shape
+                                # change the width so that height equals img_height
+                                width = int(width * img_height / height)
+                                # replace the path to the image with a relative path
+                                classification_img = classification_img.replace(
+                                    group_dir, "."
+                                )
+                                file.write(
+                                    f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
+                                )
 
-                    file.write("<br>\n")
+                            file.write("<br>\n")
 
     # clustering results
     img_height = 300
@@ -2128,7 +2112,7 @@ def create_html_report_group_results(
                 for embedding in ["PCA", "LE"]:
                     for ML_algorithm in ["KNN", "Logistic regression"]:
                         try:
-                            plot_ML_results(
+                            plot_classification_results(
                                 ML_root=ML_root,
                                 output_root=reports_root,
                                 task=task,

From 7c8a1c9cb947571761c02267fe986cba68ddeed5 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 10 Feb 2025 12:46:43 -0500
Subject: [PATCH 180/401] minor

---
 task_dFC/run_scripts_slurm/run_fmriprep.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index d58fa74..60b462a 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -3,7 +3,7 @@
 #SBATCH --job-name=fmriprep_job       # Name of the job
 #SBATCH --output=logs/fmriprep_out.log  # Standard output log
 #SBATCH --error=logs/fmriprep_err.log   # Standard error log
-#SBATCH --time=2-00:00:00                # Walltime (2 day)
+#SBATCH --time=4-00:00:00                # Walltime (4 day)
 #SBATCH --mem-per-cpu=16G                # Memory (16 GB) per cpu
 #SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
 

From 5fab5606100affdd68fc7b6db53baf83b93919eb Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 11 Mar 2025 14:36:26 -0400
Subject: [PATCH 181/401] update across_dataset

---
 task_dFC/across_dataset.py                    | 113 ++++++++++++++++--
 .../run_scripts_slurm/multi_dataset_info.json |   5 +-
 2 files changed, 104 insertions(+), 14 deletions(-)

diff --git a/task_dFC/across_dataset.py b/task_dFC/across_dataset.py
index 206a5af..c79d099 100644
--- a/task_dFC/across_dataset.py
+++ b/task_dFC/across_dataset.py
@@ -5,19 +5,108 @@
 
 import numpy as np
 
-from pydfc.ml_utils import (
-    cluster_for_visual,
-    extract_task_features,
-    task_paradigm_clustering,
-    task_presence_classification,
-    task_presence_clustering,
-)
+from pydfc.dfc_utils import dFC_mat2vec
 
 #######################################################################################
 
 
-def function_f():
-    pass
+def get_dataset_info(main_root, dataset):
+    """
+    Read {main_root}/{dataset}/codes/dataset_info.json and return
+    (TASKS, RUNS, SESSIONS, ML_root).
+
+    RUNS defaults to {task: [None]} for every task and SESSIONS to [None] when
+    the entry is absent or None. In ML_root, "{main_root}" is filled with the
+    file's "main_root" entry, whose "{dataset}" is in turn filled with `dataset`.
+    """
+    info_file = os.path.join(main_root, dataset, "codes", "dataset_info.json")
+    with open(info_file, "r") as f:
+        dataset_info = json.load(f)
+
+    TASKS = dataset_info["TASKS"]
+    # optional entries fall back to a single None placeholder
+    RUNS = dataset_info.get("RUNS")
+    if RUNS is None:
+        RUNS = {task: [None] for task in TASKS}
+
+    SESSIONS = dataset_info.get("SESSIONS")
+    if SESSIONS is None:
+        SESSIONS = [None]
+
+    # resolve the path placeholders, dataset root first
+    dataset_main_root = dataset_info["main_root"]
+    if "{dataset}" in dataset_main_root:
+        dataset_main_root = dataset_main_root.replace("{dataset}", dataset)
+    ML_root = dataset_info["ML_root"]
+    if "{main_root}" in ML_root:
+        ML_root = ML_root.replace("{main_root}", dataset_main_root)
+
+    return TASKS, RUNS, SESSIONS, ML_root
+
+
+def run_across_dataset_analysis(main_root, DATASETS):
+    """_summary_
+
+    Collect vectorized cluster centroids of all datasets and print counts.
+
+    Parameters
+    ----------
+    main_root : str, the main root of the datasets
+    DATASETS : list, the list of datasets
+    """
+    RESULTS = {
+        "centroids_mat": [],
+        "task": [],
+        "run": [],
+        "session": [],
+        "measure_name": [],
+        "dataset": [],
+    }
+    for dataset in DATASETS:
+
+        TASKS, RUNS, SESSIONS, ML_root = get_dataset_info(main_root, dataset)
+
+        # Load data: scan the dataset's centroids folder for files named
+        # centroids_{session}_{task}_{run}_{measure_name}.npy
+        # (located in ML_root/centroids, or ML_root/centroids/{session})
+        for session in SESSIONS:
+            if session is None:
+                input_path = os.path.join(ML_root, "centroids")
+            else:
+                input_path = os.path.join(ML_root, "centroids", session)
+            ALL_CENTROIDS_FILES = os.listdir(input_path)
+            ALL_CENTROIDS_FILES = [f for f in ALL_CENTROIDS_FILES if "centroids_" in f]
+            for task in TASKS:
+                for run in RUNS[task]:
+                    centroids_files = [f for f in ALL_CENTROIDS_FILES if f"_{task}_" in f]
+                    if run is not None:
+                        centroids_files = [f for f in centroids_files if f"_{run}_" in f]
+                    if session is not None:
+                        centroids_files = [
+                            f for f in centroids_files if f"_{session}_" in f
+                        ]
+                    for centroids_file in centroids_files:
+                        measure_name = centroids_file.split("_")[-1].replace(".npy", "")
+                        centroids = np.load(os.path.join(input_path, centroids_file))
+                        centroids_mat = centroids[
+                            "centroids_mat"
+                        ]  # shape: (n_clusters, n_regions, n_regions)
+                        centroids_mat = dFC_mat2vec(
+                            centroids_mat
+                        )  # shape: (n_clusters, n_regions*(n_regions-1)/2)
+                        for i in range(centroids_mat.shape[0]):
+                            RESULTS["centroids_mat"].append(centroids_mat[i])
+                            RESULTS["task"].append(task)
+                            RESULTS["run"].append(run)
+                            RESULTS["session"].append(session)
+                            RESULTS["measure_name"].append(measure_name)
+                            RESULTS["dataset"].append(dataset)
+
+    # give statistics
+    print(f"Number of centroids: {len(RESULTS['centroids_mat'])}")
+    print(f"Number of tasks: {len(set(RESULTS['task']))}")
+    print(f"Number of measure_names: {len(set(RESULTS['measure_name']))}")
+    print(f"Number of datasets: {len(set(RESULTS['dataset']))}")
 
 
 #######################################################################################
@@ -48,11 +137,11 @@ def function_f():
     DATASETS = multi_dataset_info["DATASETS"]
 
     try:
-        function_f()
+        run_across_dataset_analysis(multi_dataset_info["main_root"], DATASETS)
     except Exception as e:
-        print(f"Error in task features extraction: {e}")
+        print(f"Error in run_across_dataset_analysis: {e}")
         traceback.print_exc()
-    print("Task features extraction finished.")
+    print("run_across_dataset_analysis finished.")
 
     print("Multi-Dataset Analysis finished.")
 
diff --git a/task_dFC/run_scripts_slurm/multi_dataset_info.json b/task_dFC/run_scripts_slurm/multi_dataset_info.json
index d2aa510..93a5a69 100644
--- a/task_dFC/run_scripts_slurm/multi_dataset_info.json
+++ b/task_dFC/run_scripts_slurm/multi_dataset_info.json
@@ -1,6 +1,7 @@
 {
-	"main_root" : "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/{dataset}",
+	"main_root" : "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro",
 	"DATASETS" : [
-		"ses-1"
+		"ds001734", "ds002843", "ds003465", "ds004044", "ds004359", "ds004746",
+		"ds002647", "ds002994", "ds003612", "ds004302", "ds004556"
 	]
 }

From ac0dab30343714fe6fa9fd58287ae161862c9cb6 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 11 Mar 2025 14:45:30 -0400
Subject: [PATCH 182/401] add affinity mat to across_dataset

---
 task_dFC/across_dataset.py | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/task_dFC/across_dataset.py b/task_dFC/across_dataset.py
index c79d099..c7f6f9f 100644
--- a/task_dFC/across_dataset.py
+++ b/task_dFC/across_dataset.py
@@ -44,6 +44,40 @@ def get_dataset_info(main_root, dataset):
     return TASKS, RUNS, SESSIONS, ML_root
 
 
+def plot_affinity_matrix(centroids_mat, save_path=None, n_neighbors=125):
+    """
+    Plot (and optionally save as png) the k-nearest-neighbour connectivity
+    graph between centroids, using correlation distance.
+
+    centroids_mat: centroid vectors, one per row (stacked with np.array)
+    save_path: output file path; nothing is written when None
+    n_neighbors: requested neighbours per centroid, capped at n_centroids - 1
+    """
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    from sklearn.neighbors import kneighbors_graph
+
+    X = np.array(centroids_mat)  # shape: (n_centroids, n_regions*(n_regions-1)/2)
+    affinity_matrix = kneighbors_graph(
+        X,
+        # a sample cannot have more neighbours than there are other samples;
+        # without the cap fewer than n_neighbors + 1 centroids raise ValueError
+        n_neighbors=min(n_neighbors, X.shape[0] - 1),
+        mode="connectivity",
+        include_self=False,
+        metric="correlation",
+    )
+
+    plt.figure(figsize=(10, 10))
+    sns.heatmap(affinity_matrix.toarray())
+    if save_path is not None:
+        # always written as png, regardless of save_path's extension
+        plt.savefig(
+            save_path, format="png", bbox_inches="tight", dpi=120, pad_inches=0.1
+        )
+    plt.close()
+
+
 def run_across_dataset_analysis(main_root, DATASETS):
     """_summary_
 
@@ -108,6 +142,9 @@ def run_across_dataset_analysis(main_root, DATASETS):
     print(f"Number of measure_names: {len(set(RESULTS['measure_name']))}")
     print(f"Number of datasets: {len(set(RESULTS['dataset']))}")
 
+    # plot the affinity matrix
+    plot_affinity_matrix(RESULTS["centroids_mat"], save_path="affinity_matrix.png")
+
 
 #######################################################################################
 

From 60d6eae8708ec027f99825de43858d6733dc4d9c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 1 Apr 2025 23:09:54 +0330
Subject: [PATCH 183/401] test glasso

---
 pydfc/dfc_methods/sliding_window.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pydfc/dfc_methods/sliding_window.py b/pydfc/dfc_methods/sliding_window.py
index ac202fd..7b6a04f 100644
--- a/pydfc/dfc_methods/sliding_window.py
+++ b/pydfc/dfc_methods/sliding_window.py
@@ -102,7 +102,8 @@ def FC(self, time_series):
             mean = np.mean(time_series, axis=1, keepdims=True)
             std = np.std(time_series, axis=1, keepdims=True)
             time_series_standardized = np.where(std != 0, (time_series - mean) / std, 0)
-            model = GraphicalLasso(alpha=self.graphical_lasso_alpha_)
+            # model = GraphicalLasso(alpha=self.graphical_lasso_alpha_)
+            model = GraphicalLasso(alpha=0.1)
             model.fit(time_series_standardized.T)
             # the covariance matrix will equal the correlation matrix
             C = model.covariance_

From 4fcda4b52d704b7ef6c758bb4804f0cb331ded19 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 9 Apr 2025 11:59:17 +0330
Subject: [PATCH 184/401] change the glasso back

---
 pydfc/dfc_methods/sliding_window.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pydfc/dfc_methods/sliding_window.py b/pydfc/dfc_methods/sliding_window.py
index 7b6a04f..ac202fd 100644
--- a/pydfc/dfc_methods/sliding_window.py
+++ b/pydfc/dfc_methods/sliding_window.py
@@ -102,8 +102,7 @@ def FC(self, time_series):
             mean = np.mean(time_series, axis=1, keepdims=True)
             std = np.std(time_series, axis=1, keepdims=True)
             time_series_standardized = np.where(std != 0, (time_series - mean) / std, 0)
-            # model = GraphicalLasso(alpha=self.graphical_lasso_alpha_)
-            model = GraphicalLasso(alpha=0.1)
+            model = GraphicalLasso(alpha=self.graphical_lasso_alpha_)
             model.fit(time_series_standardized.T)
             # the covariance matrix will equal the correlation matrix
             C = model.covariance_

From ad962f7cfa2415da77a8f862e28f08a040bdcf65 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 30 Apr 2025 15:06:41 -0400
Subject: [PATCH 185/401] add binarizing_method shift

---
 pydfc/ml_utils.py           |  4 ++--
 pydfc/task_utils.py         | 37 +++++++++++++++++++++++++++++++------
 task_dFC/generate_report.py |  4 ++--
 3 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 112be80..38adb22 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -181,7 +181,7 @@ def extract_task_features(TASKS, RUNS, session, roi_root, dFC_root, no_hrf=False
                     TR_task=TR_task,
                     TR_mri=task_data["TR_mri"],
                     binary=True,
-                    binarizing_method="mean",
+                    binarizing_method="shift",
                     no_hrf=no_hrf,
                 )
 
@@ -237,7 +237,7 @@ def dFC_feature_extraction_subj_lvl(
         TR_mri=task_data["TR_mri"],
         TR_array=TR_array,
         binary=True,
-        binarizing_method="mean",
+        binarizing_method="shift",
     )
 
     features = dFC_vecs
diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index a750494..1e61bdc 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -266,6 +266,25 @@ def downsample_events_hrf(events_hrf, TR_mri, TR_task, method="uniform"):
     return events_hrf_ds
 
 
+def shifted_binarizing(
+    event_labels_all_task_hrf,
+    task_presence_ratio=0.5,
+    step=0.001,
+):
+    """
+    Scan thresholds 0, step, 2*step, ... (below the signal max) and return the
+    first one for which the fraction of samples strictly above it is
+    <= task_presence_ratio; if none qualifies, the last scanned one is returned.
+    """
+    # default for an empty scan (signal max <= 0); previously `threshold` was
+    # unbound in that case and the return raised UnboundLocalError
+    threshold = 0.0
+    for threshold in np.arange(0, np.max(event_labels_all_task_hrf), step):
+        if np.mean(event_labels_all_task_hrf > threshold) <= task_presence_ratio:
+            break
+    return threshold
+
+
 def extract_task_presence(
     event_labels,
     TR_task,
@@ -284,7 +303,8 @@ def extract_task_presence(
     This function extracts the task presence from the event labels and returns it in the same time points as the dFC data
     It also downsamples the task presence to the time points of the dFC data
     if binary is True, the task presence is binarized using the mean of the task presence
-    binarizing_method: 'median' or 'mean'
+    binarizing_method: 'median' or 'mean' or 'shift'
+    if binarizing_method is 'shift', the task presence is binarized such that the ratio of 1 to 0 is equal to the task presence ratio
 
     if no_hrf is True, the task presence is not convolved with HRF
     """
@@ -307,13 +327,18 @@ def extract_task_presence(
 
     if binary:
         if binarizing_method == "median":
-            task_presence = np.where(
-                event_labels_all_task_hrf > np.median(event_labels_all_task_hrf), 1, 0
-            )
+            threshold = np.median(event_labels_all_task_hrf)
         elif binarizing_method == "mean":
-            task_presence = np.where(
-                event_labels_all_task_hrf > np.mean(event_labels_all_task_hrf), 1, 0
+            threshold = np.mean(event_labels_all_task_hrf)
+        elif binarizing_method == "shift":
+            task_presence_ratio = np.mean(event_labels_all_task)
+            threshold = shifted_binarizing(
+                event_labels_all_task_hrf=event_labels_all_task_hrf,
+                task_presence_ratio=task_presence_ratio,
             )
+        else:
+            raise ValueError("binarizing_method should be 'median', 'mean' or 'shift'")
+        task_presence = np.where(event_labels_all_task_hrf > threshold, 1, 0)
     else:
         task_presence = event_labels_all_task_hrf
 
diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 1fb7159..4fa683a 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -419,7 +419,7 @@ def plot_task_presence(
         TR_task=TR_task,
         TR_mri=task_data["TR_mri"],
         binary=True,
-        binarizing_method="mean",
+        binarizing_method="shift",
     )
 
     time = np.arange(0, task_presence.shape[0]) / Fs_mri
@@ -487,7 +487,7 @@ def calculate_subj_lvl_task_presence_characteristics(
         TR_task=TR_task,
         TR_mri=task_data["TR_mri"],
         binary=True,
-        binarizing_method="mean",
+        binarizing_method="shift",
     )
     relative_task_on = task_utils.calc_relative_task_on(task_presence)
     # task duration

From 971a1fff3577d2f04792de3a427aebfe69e79346 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 1 May 2025 23:53:24 -0400
Subject: [PATCH 186/401] update report gen

---
 task_dFC/generate_report.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 4fa683a..a39559d 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -433,12 +433,7 @@ def plot_task_presence(
         time[start_TR:end_TR], task_presence_non_binarized[start_TR:end_TR], linewidth=4
     )
     plt.plot(time[start_TR:end_TR], task_presence[start_TR:end_TR], linewidth=4)
-    # plot mean of task presence_non_binarized as a line
-    plt.plot(
-        time[start_TR:end_TR],
-        np.mean(task_presence_non_binarized) * np.ones_like(time[start_TR:end_TR]),
-        linewidth=4,
-    )
+
     # put vertical lines at the start of each TR
     for TR in range(start_TR, end_TR):
         plt.axvline(x=TR * TR_mri, color="r", linestyle="--")

From 0e140300991a6679cdea5ce3627c08449af16204 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 2 May 2025 11:42:16 -0400
Subject: [PATCH 187/401] use bold.json in fmriprep root

---
 task_dFC/nifti_to_roi_signal.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index 52dc1c8..ec14515 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -63,7 +63,19 @@ def run_roi_signal_extraction(
         else:
             nifti_file = f"{fmriprep_root}/{subj}/{session}/func/{task_file}"
             task_events_root = f"{bids_root}/{subj}/{session}/func"
-        info_file = f"{task_events_root}/{task_file.replace(bold_suffix, '_bold.json')}"
+        # we need the info file to get the TR
+        # we can find the acquisition data in either the fmriprep folder
+        # or in the bids folder
+        # BUT for multi-echo data, we must use the fmriprep folder
+        # because the bids folder contains multiple files for each echo
+        # so first we check if the file exists in the fmriprep folder
+        # and if not, we check the bids folder
+        # the info file is the same as the nifti file but with a .json extension
+        info_file = nifti_file.replace(".nii.gz", ".json")
+        if not os.path.exists(info_file):
+            info_file = (
+                f"{task_events_root}/{task_file.replace(bold_suffix, '_bold.json')}"
+            )
 
         if os.path.exists(info_file):
             f = open(info_file)

From 0cb1b0248e36cd2a928ac796235ee376a662c9f7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 2 May 2025 12:07:59 -0400
Subject: [PATCH 188/401] change data loader to catch a bug

---
 pydfc/data_loader.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pydfc/data_loader.py b/pydfc/data_loader.py
index 7626cd0..d72c552 100644
--- a/pydfc/data_loader.py
+++ b/pydfc/data_loader.py
@@ -385,7 +385,15 @@ def load_TS(
         if TS is None:
             TS = time_series
         else:
-            TS.concat_ts(time_series)
+            try:
+                TS.concat_ts(time_series)
+            except AssertionError as e:
+                # print the error message
+                print(f"Error in concatenating time series for {subj}: {e}")
+                # raise error with a message and stop the program
+                raise Exception(
+                    f"Fs of subj {subj} TS is {time_series.Fs} while the group Fs is {TS.Fs}"
+                )
 
     return TS
 

From e8a0700b32ba498414811eb1ed26d40e02500656 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 6 May 2025 13:12:06 -0400
Subject: [PATCH 189/401] minor change in run_fmriprep

---
 task_dFC/run_scripts_slurm/run_fmriprep.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index 60b462a..60e7da4 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -3,7 +3,6 @@
 #SBATCH --job-name=fmriprep_job       # Name of the job
 #SBATCH --output=logs/fmriprep_out.log  # Standard output log
 #SBATCH --error=logs/fmriprep_err.log   # Standard error log
-#SBATCH --time=4-00:00:00                # Walltime (4 day)
 #SBATCH --mem-per-cpu=16G                # Memory (16 GB) per cpu
 #SBATCH --cpus-per-task=8              # Number of CPU cores (increase based on availability)
 

From 59e7da303901648f1e4fe05f95bf86581cec3866 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 6 May 2025 13:20:21 -0400
Subject: [PATCH 190/401] catch and report per-run errors in dFC assessment

---
 task_dFC/dFC_assessment.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py
index 1ca06ad..2be912a 100644
--- a/task_dFC/dFC_assessment.py
+++ b/task_dFC/dFC_assessment.py
@@ -216,16 +216,22 @@ def run_dFC_assess(
     for session in SESSIONS:
         for task in TASKS:
             for run in RUNS[task]:
-                run_dFC_assess(
-                    subj_id=participant_id,
-                    task=task,
-                    roi_root=roi_root,
-                    fitted_measures_root=fitted_measures_root,
-                    output_root=output_root,
-                    params_multi_analysis=params_multi_analysis,
-                    session=session,
-                    run=run,
-                )
+                try:
+                    run_dFC_assess(
+                        subj_id=participant_id,
+                        task=task,
+                        roi_root=roi_root,
+                        fitted_measures_root=fitted_measures_root,
+                        output_root=output_root,
+                        params_multi_analysis=params_multi_analysis,
+                        session=session,
+                        run=run,
+                    )
+                except Exception as e:
+                    print(
+                        f"Error in dFC assessment for subject {participant_id}, task {task}, session {session}, run {run}: {e}"
+                    )
+                    continue
 
     print(
         f"subject-level dFC assessment CODE finished running for subject: {participant_id}"

From aad52f0308fb2af738af30767da26c721f161e46 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 13 May 2025 21:07:01 -0400
Subject: [PATCH 191/401] improve procrustes

---
 pydfc/ml_utils.py | 70 +++++++++++++++++++++++++++--------------------
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 38adb22..8e6334f 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -47,6 +47,9 @@
 def find_available_subjects(dFC_root, task, run=None, session=None, dFC_id=None):
     """
     Find the subjects that have dFC results for the given task and dFC_id (method).
+
+    If run and session are specified, the dFC results for that run and session will be used.
+    Otherwise, the subjects that have dFC results at least for one run and session will returned.
     """
     SUBJECTS = list()
     ALL_SUBJ_FOLDERS = os.listdir(f"{dFC_root}/")
@@ -413,12 +416,46 @@ def precheck_for_procruste(X_best, X_subj):
     return X_best_new
 
 
-def generalized_procrustes(X_list):
+def generalized_procrustes(X_embed_dict):
     """
     Generalized Procrustes Analysis
 
-    returns the mean X to be used as the reference for procrustes transformation
+    X_embed_dict: dict
+        dict of scans and their embeddings
+
+    returns the mean X across scans to be used as the reference for procrustes transformation
     """
+    # initial step
+    # not all scans have the same number of samples
+    # find the max number of samples among all scans
+    max_samples = 0
+    for scan in X_embed_dict:
+        if X_embed_dict[scan].shape[0] > max_samples:
+            max_samples = X_embed_dict[scan].shape[0]
+
+    # find the mean embedding of all scans to use as the reference for procrustes transformation
+    X_list = []
+    for scan in X_embed_dict:
+        X_scan_embed = X_embed_dict[scan]
+        # add zero rows to the embedding of the scan with less samples
+        if X_scan_embed.shape[0] < max_samples:
+            X_scan_embed_new = np.concatenate(
+                (
+                    X_scan_embed,
+                    np.zeros(
+                        (
+                            max_samples - X_scan_embed.shape[0],
+                            X_scan_embed.shape[1],
+                        )
+                    ),
+                ),
+                axis=0,
+            )
+        else:
+            X_scan_embed_new = X_scan_embed
+        X_list.append(X_scan_embed_new)
+
+    # now iteratively find the mean X for transform
     for iter_num in range(100):
 
         try:
@@ -750,34 +787,7 @@ def LE_embed_procustes(
             )
             embed_dict[subject] = X_subj_embed
 
-        # then find the max number of samples among all subjects
-        max_samples = 0
-        for subject in train_subjects:
-            if embed_dict[subject].shape[0] > max_samples:
-                max_samples = embed_dict[subject].shape[0]
-
-        # find the mean embedding of all subjects to use as the reference for procrustes transformation
-        X_train_list = []
-        for subject in train_subjects:
-            X_subj_embed = embed_dict[subject]
-            # add zero rows to the embedding of the subject with less samples
-            if X_subj_embed.shape[0] < max_samples:
-                X_subj_embed_new = np.concatenate(
-                    (
-                        X_subj_embed,
-                        np.zeros(
-                            (
-                                max_samples - X_subj_embed.shape[0],
-                                X_subj_embed.shape[1],
-                            )
-                        ),
-                    ),
-                    axis=0,
-                )
-            else:
-                X_subj_embed_new = X_subj_embed
-            X_train_list.append(X_subj_embed_new)
-        mean_X_train = generalized_procrustes(X_train_list)
+        mean_X_train = generalized_procrustes(embed_dict)
 
         X_train_embed = None
         for subject in train_subjects:

From c057c4706c51eaf92b7c94c85311cfb3ec478202 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 15 May 2025 20:37:51 -0400
Subject: [PATCH 192/401] fix bug in find_available_subjects

---
 pydfc/ml_utils.py | 129 +++-------------------------------------------
 task_dFC/ML.py    |  62 ----------------------
 2 files changed, 8 insertions(+), 183 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 8e6334f..1c3bf43 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -48,8 +48,12 @@ def find_available_subjects(dFC_root, task, run=None, session=None, dFC_id=None)
     """
     Find the subjects that have dFC results for the given task and dFC_id (method).
 
-    If run and session are specified, the dFC results for that run and session will be used.
-    Otherwise, the subjects that have dFC results at least for one run and session will returned.
+    If run is specified, the dFC results for that run will be used.
+    Otherwise, the subjects that have dFC results for at least one run will be returned.
+
+    If session is specified, the dFC results for that session will be used.
+    Otherwise, it is considered that the dataset does not have session information.
+    Note that not specifying session will cause an error if the dataset has session information.
     """
     SUBJECTS = list()
     ALL_SUBJ_FOLDERS = os.listdir(f"{dFC_root}/")
@@ -59,6 +63,8 @@ def find_available_subjects(dFC_root, task, run=None, session=None, dFC_id=None)
         if session is None:
             ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/")
         else:
+            if not os.path.exists(f"{dFC_root}/{subj_folder}/{session}/"):
+                continue
             ALL_DFC_FILES = os.listdir(f"{dFC_root}/{subj_folder}/{session}/")
         ALL_DFC_FILES = [
             dFC_file for dFC_file in ALL_DFC_FILES if f"_{task}_" in dFC_file
@@ -1367,125 +1373,6 @@ def task_presence_clustering(
     return clustering_RESULTS, clustering_scores
 
 
-def task_paradigm_clustering(
-    dFC_id,
-    TASKS,
-    RUNS,
-    session,
-    roi_root,
-    dFC_root,
-    normalize_dFC=True,
-):
-    # find SUBJECTS common to all tasks
-    for task_id, task in enumerate(TASKS):
-        if task_id == 0:
-            SUBJECTS = find_available_subjects(
-                dFC_root=dFC_root, task=task, dFC_id=dFC_id
-            )
-        else:
-            SUBJECTS = np.intersect1d(
-                SUBJECTS,
-                find_available_subjects(dFC_root=dFC_root, task=task, dFC_id=dFC_id),
-            )
-    print(f"Number of subjects: {len(SUBJECTS)}")
-
-    X = None
-    y = None
-    subj_label = None
-    measure_name = None
-    for task_id, task in enumerate(TASKS):
-        for run in RUNS[task]:
-            X_new, _, _, _, subj_label_new, _, measure_name_new = dFC_feature_extraction(
-                task=task,
-                train_subjects=SUBJECTS,
-                test_subjects=[],
-                dFC_id=dFC_id,
-                roi_root=roi_root,
-                dFC_root=dFC_root,
-                run=run,
-                session=session,
-                dynamic_pred="no",
-                normalize_dFC=normalize_dFC,
-            )
-
-            # normalize the features
-            X_new = zscore(X_new, axis=0)
-
-            if measure_name is not None:
-                assert measure_name == measure_name_new, "dFC measure is not consistent."
-            else:
-                measure_name = measure_name_new
-
-            y_new = np.ones(X_new.shape[0]) * task_id
-            if X is None and y is None:
-                X = X_new
-                y = y_new
-                subj_label = subj_label_new
-            else:
-                X = np.concatenate((X, X_new), axis=0)
-                y = np.concatenate((y, y_new), axis=0)
-                subj_label = np.concatenate((subj_label, subj_label_new), axis=0)
-
-    assert X.shape[0] == y.shape[0], "Number of samples do not match."
-    assert X.shape[0] == subj_label.shape[0], "Number of samples do not match."
-
-    # rearrange the order of the samples so that the samples of the same subject are together
-    idx = np.argsort(subj_label)
-    X = X[idx, :]
-    y = y[idx]
-    subj_label = subj_label[idx]
-
-    task_paradigm_clstr_RESULTS = {"PCA": {}, "LE": {}}
-    for embedding in ["PCA", "LE"]:
-        # embed dFC features
-        try:
-            X_embed, _ = embed_dFC_features(
-                train_subjects=SUBJECTS,
-                test_subjects=[],
-                X_train=X,
-                X_test=None,
-                y_train=y,
-                y_test=None,
-                subj_label_train=subj_label,
-                subj_label_test=None,
-                embedding=embedding,
-                n_components="auto",
-                n_neighbors_LE=125,
-                LE_embedding_method="embed+procrustes",
-            )
-        except:
-            continue
-
-        # clustering
-        # apply kmeans clustering to dFC features
-
-        n_clusters = len(TASKS)  # corresponding to task paradigms
-
-        scaler = StandardScaler()
-        X_normalized = scaler.fit_transform(X_embed)
-        kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
-        labels_pred = kmeans.fit_predict(X_normalized)
-
-        # # visualize clustering centroids
-        # centroids = kmeans.cluster_centers_
-        # centroids = pca.inverse_transform(centroids)
-        # centroids = scaler.inverse_transform(centroids)
-        # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
-        # centroids_mat = dFC_vec2mat(centroids, n_regions)
-
-        task_paradigm_clstr_RESULTS[embedding] = {
-            "dFC_method": measure_name,
-            "StandardScaler": scaler,
-            "kmeans": kmeans,
-            "ARI": adjusted_rand_score(y, labels_pred),
-            "SI": silhouette_score(X_normalized, y),
-            # "centroids": centroids_mat,
-            "task_paradigms": TASKS,
-        }
-
-    return task_paradigm_clstr_RESULTS
-
-
 def co_occurrence(task_labels, clstr_labels):
     """
     Calculate the co-occurrence between task labels and clustering labels.
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index f2888a5..9617db8 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -8,7 +8,6 @@
 from pydfc.ml_utils import (
     cluster_for_visual,
     extract_task_features,
-    task_paradigm_clustering,
     task_presence_classification,
     task_presence_clustering,
 )
@@ -188,49 +187,6 @@ def run_clustering(
         np.save(f"{folder}/clustering_scores_{dFC_id}.npy", clustering_scores)
 
 
-def run_task_paradigm_clustering(
-    dFC_id,
-    TASKS,
-    RUNS,
-    SESSIONS,
-    roi_root,
-    dFC_root,
-    output_root,
-    normalize_dFC=True,
-):
-    for session in SESSIONS:
-
-        try:
-            task_paradigm_clstr_RESULTS = task_paradigm_clustering(
-                dFC_id=dFC_id,
-                TASKS=TASKS,
-                RUNS=RUNS,
-                session=session,
-                roi_root=roi_root,
-                dFC_root=dFC_root,
-                normalize_dFC=normalize_dFC,
-            )
-        except Exception as e:
-            print(f"Error in task paradigm clustering for {session}: {e}")
-            traceback.print_exc()
-            continue
-
-        if session is None:
-            folder = f"{output_root}/task_paradigm_clstr"
-        else:
-            folder = f"{output_root}/task_paradigm_clstr/{session}"
-        try:
-            if not os.path.exists(folder):
-                os.makedirs(folder)
-        except OSError as err:
-            print(err)
-
-        np.save(
-            f"{folder}/task_paradigm_clstr_RESULTS_{dFC_id}.npy",
-            task_paradigm_clstr_RESULTS,
-        )
-
-
 def run_clustering_for_visual(
     dFC_id,
     TASKS,
@@ -414,24 +370,6 @@ def run_clustering_for_visual(
 
     # print(f"Task presence clustering finished for dFC ID {dFC_id}.")
 
-    # print(f"Task paradigm clustering started for dFC ID {dFC_id} ...")
-    # try:
-    #     run_task_paradigm_clustering(
-    #         dFC_id=dFC_id,
-    #         TASKS=TASKS,
-    #         RUNS=RUNS,
-    #         SESSIONS=SESSIONS,
-    #         roi_root=roi_root,
-    #         dFC_root=dFC_root,
-    #         output_root=ML_root,
-    #         normalize_dFC=True,
-    #     )
-    # except Exception as e:
-    #     print(f"Error in task paradigm clustering for dFC ID {dFC_id}: {e}")
-    #     traceback.print_exc()
-
-    # print(f"Task paradigm clustering finished for dFC ID {dFC_id}.")
-
     print(f"Clustering for visualization started for dFC ID {dFC_id} ...")
     try:
         run_clustering_for_visual(

From d588a4367795b956ee71e53e5f0d3b4303d3aeb5 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 16 May 2025 18:31:25 -0400
Subject: [PATCH 193/401] implement SVM_classify

---
 pydfc/ml_utils.py | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 1c3bf43..cb96f16 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -31,6 +31,7 @@
 from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors, kneighbors_graph
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
+from sklearn.svm import SVC
 
 from .dfc_utils import dFC_mat2vec, dFC_vec2mat, rank_norm
 from .task_utils import (
@@ -954,6 +955,40 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test):
     return RESULT
 
 
+def SVM_classify(X_train, y_train, X_test, y_test):
+    """
+    SVM classification
+    """
+    # define the parameter grid
+    param_grid = {
+        "svc__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+        "svc__gamma": [0.0001, 0.001, 0.01, 1, 10, 100, 1000],
+    }
+
+    # perform grid search
+    model_for_hyperparam = make_pipeline(
+        StandardScaler(),
+        SVC(kernel="rbf", class_weight={0: 1, 1: 10}),
+    )
+    model_gscv = GridSearchCV(model_for_hyperparam, param_grid, cv=3, n_jobs=-1)
+    model_gscv.fit(X_train, y_train)
+    C = model_gscv.best_params_["svc__C"]
+    gamma = model_gscv.best_params_["svc__gamma"]
+
+    model = make_pipeline(
+        StandardScaler(),
+        SVC(kernel="rbf", C=C, gamma=gamma, class_weight={0: 1, 1: 10}),
+    ).fit(X_train, y_train)
+
+    RESULT = {
+        "SVC_cv_results": model_gscv.cv_results_,
+        "SVC_model": model,
+        "SVC_train_score": model.score(X_train, y_train),
+        "SVC_test_score": model.score(X_test, y_test),
+    }
+    return RESULT
+
+
 def KNN_classify(X_train, y_train, X_test, y_test):
     """
     KNN classification

From 1dff794b9b1c611a8d891d7ec77eeaba9c23cdb2 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 21 May 2025 13:59:04 -0400
Subject: [PATCH 194/401] modify SVC

---
 pydfc/ml_utils.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index cb96f16..fa91221 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -959,6 +959,16 @@ def SVM_classify(X_train, y_train, X_test, y_test):
     """
     SVM classification
     """
+
+    # set class weights so the task has 10 times more weight than the rest
+    # considering also their number of samples
+    task_count = np.sum(y_train == 1)
+    rest_count = np.sum(y_train == 0)
+    if task_count > rest_count:
+        class_weight = {0: 1, 1: 10}
+    else:
+        class_weight = {0: 1, 1: int(rest_count / task_count) * 10}
+
     # define the parameter grid
     param_grid = {
         "svc__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
@@ -968,7 +978,7 @@ def SVM_classify(X_train, y_train, X_test, y_test):
     # perform grid search
     model_for_hyperparam = make_pipeline(
         StandardScaler(),
-        SVC(kernel="rbf", class_weight={0: 1, 1: 10}),
+        SVC(kernel="rbf", class_weight=class_weight),
     )
     model_gscv = GridSearchCV(model_for_hyperparam, param_grid, cv=3, n_jobs=-1)
     model_gscv.fit(X_train, y_train)
@@ -977,7 +987,7 @@ def SVM_classify(X_train, y_train, X_test, y_test):
 
     model = make_pipeline(
         StandardScaler(),
-        SVC(kernel="rbf", C=C, gamma=gamma, class_weight={0: 1, 1: 10}),
+        SVC(kernel="rbf", C=C, gamma=gamma, class_weight=class_weight),
     ).fit(X_train, y_train)
 
     RESULT = {

From 56c3adb96c29537231e33283205a1bf8dd1e9aa5 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 21 May 2025 21:46:49 -0400
Subject: [PATCH 195/401] replace KNN with SVM

---
 pydfc/ml_utils.py           | 39 +++++++++++++------------------------
 task_dFC/generate_report.py |  6 ++++--
 2 files changed, 17 insertions(+), 28 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index fa91221..5644195 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -991,10 +991,10 @@ def SVM_classify(X_train, y_train, X_test, y_test):
     ).fit(X_train, y_train)
 
     RESULT = {
-        "SVC_cv_results": model_gscv.cv_results_,
-        "SVC_model": model,
-        "SVC_train_score": model.score(X_train, y_train),
-        "SVC_test_score": model.score(X_test, y_test),
+        "SVM_cv_results": model_gscv.cv_results_,
+        "SVM_model": model,
+        "SVM_train_score": model.score(X_train, y_train),
+        "SVM_test_score": model.score(X_test, y_test),
     }
     return RESULT
 
@@ -1122,7 +1122,7 @@ def task_presence_classification(
     train_test_ratio=0.8,
 ):
     """
-    perform task presence classification using logistic regression, KNN, Random Forest, Gradient Boosting
+    perform task presence classification using logistic regression, SVM, KNN, Random Forest, Gradient Boosting
     for a given task and dFC method and run.
     """
     if run is None:
@@ -1200,35 +1200,22 @@ def task_presence_classification(
             X_train_embedded, y_train, X_test_embedded, y_test
         )
 
-        # KNN
-        KNN_RESULT = KNN_classify(X_train_embedded, y_train, X_test_embedded, y_test)
+        # SVM
+        SVM_RESULT = SVM_classify(X_train_embedded, y_train, X_test_embedded, y_test)
 
-        # # Random Forest
-        # RF_RESULT = random_forest_classify(
-        #     X_train_embedded, y_train, X_test_embedded, y_test
-        # )
-
-        # # Gradient Boosting
-        # GBT_RESULT = gradient_boosting_classify(
-        #     X_train_embedded, y_train, X_test_embedded, y_test
-        # )
+        # # KNN
+        # KNN_RESULT = KNN_classify(X_train_embedded, y_train, X_test_embedded, y_test)
 
         for key in log_reg_RESULT:
             ML_RESULT[embedding][key] = log_reg_RESULT[key]
-        for key in KNN_RESULT:
-            ML_RESULT[embedding][key] = KNN_RESULT[key]
-        # for key in RF_RESULT:
-        #     ML_RESULT[embedding][key] = RF_RESULT[key]
-        # for key in GBT_RESULT:
-        #     ML_RESULT[embedding][key] = GBT_RESULT[key]
+        for key in SVM_RESULT:
+            ML_RESULT[embedding][key] = SVM_RESULT[key]
 
         # measure pred score on each subj
         log_reg = log_reg_RESULT["log_reg_model"]
-        KNN = KNN_RESULT["KNN_model"]
-        # RF = RF_RESULT["RF_model"]
-        # GBT = GBT_RESULT["GB_model"]
+        SVM = SVM_RESULT["SVM_model"]
 
-        ML_models = {"Logistic regression": log_reg, "KNN": KNN}
+        ML_models = {"Logistic regression": log_reg, "SVM": SVM}
 
         for subj in SUBJECTS:
             ML_scores["subj_id"].append(subj)
diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index a39559d..66fa455 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -642,6 +642,8 @@ def plot_classification_results(
 
     if ML_algorithm == "Logistic regression":
         ML_algorithm_name = "LogReg"
+    elif ML_algorithm == "SVM":
+        ML_algorithm_name = "SVM"
     elif ML_algorithm == "KNN":
         ML_algorithm_name = "KNN"
     elif ML_algorithm == "Random Forest":
@@ -1570,7 +1572,7 @@ def create_html_report_group_results(
         "fn",
         "average precision",
     ]
-    classification_models = {"LogReg": "Logistic Regression", "KNN": "KNN"}
+    classification_models = {"LogReg": "Logistic Regression", "SVM": "SVM"}
     img_height = 300
     file.write("<h1>Classification Results</h1>\n")
     for session in SESSIONS:
@@ -2105,7 +2107,7 @@ def create_html_report_group_results(
         for task in TASKS:
             for run in RUNS[task]:
                 for embedding in ["PCA", "LE"]:
-                    for ML_algorithm in ["KNN", "Logistic regression"]:
+                    for ML_algorithm in ["SVM", "Logistic regression"]:
                         try:
                             plot_classification_results(
                                 ML_root=ML_root,

From be3cae171349cc0479a98dc14651e087394852e0 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 22 May 2025 19:54:55 -0400
Subject: [PATCH 196/401] add clstr_distance to methods_config

---
 task_dFC/run_scripts_slurm/methods_config.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/task_dFC/run_scripts_slurm/methods_config.json b/task_dFC/run_scripts_slurm/methods_config.json
index 0fc455f..8ff76f2 100644
--- a/task_dFC/run_scripts_slurm/methods_config.json
+++ b/task_dFC/run_scripts_slurm/methods_config.json
@@ -6,6 +6,7 @@
         "tapered_window": true,
         "TF_method": "WTC",
         "clstr_base_measure": "SlidingWindow",
+        "clstr_distance": "euclidean",
         "hmm_iter": 20,
         "dhmm_obs_state_ratio": 0.666,
         "n_states": 5,

From c067737412482be5aabfd5d0df67a3d7347caeb6 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 22 May 2025 23:47:41 -0400
Subject: [PATCH 197/401] introduce extract_abs_task_presence

---
 pydfc/ml_utils.py   | 25 ++++++++++++++++++++-----
 pydfc/task_utils.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 5644195..da32b8f 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -39,6 +39,7 @@
     calc_rest_duration,
     calc_task_duration,
     calc_transition_freq,
+    extract_abs_task_presence,
     extract_task_presence,
 )
 
@@ -241,17 +242,31 @@ def dFC_feature_extraction_subj_lvl(
     dFC_vecs = dFC_mat2vec(dFC_mat)
 
     # event data
-    task_presence = extract_task_presence(
+    # task_presence = extract_task_presence(
+    #     event_labels=task_data["event_labels"],
+    #     TR_task=1 / task_data["Fs_task"],
+    #     TR_mri=task_data["TR_mri"],
+    #     TR_array=TR_array,
+    #     binary=True,
+    #     binarizing_method="shift",
+    # )
+    abs_task_presence, indices = extract_abs_task_presence(
         event_labels=task_data["event_labels"],
         TR_task=1 / task_data["Fs_task"],
         TR_mri=task_data["TR_mri"],
         TR_array=TR_array,
-        binary=True,
-        binarizing_method="shift",
     )
 
-    features = dFC_vecs
-    target = task_presence.ravel()
+    # features = dFC_vecs
+    # target = task_presence.ravel()
+
+    # use absolute task presence
+    features = dFC_vecs[indices, :]
+    target = abs_task_presence.ravel()
+
+    assert (
+        features.shape[0] == target.shape[0]
+    ), "Features and target have different number of samples."
 
     if dynamic_pred == "past":
         # concat current TR and two TR before of features to predict the current TR of target
diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 1e61bdc..bbdff4e 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -351,6 +351,49 @@ def extract_task_presence(
     return task_presence
 
 
+def extract_abs_task_presence(
+    event_labels,
+    TR_task,
+    TR_mri,
+    TR_array=None,
+):
+    """
+    event_labels: event labels including 0 and event ids at the time each event happens
+    TR_task: TR of task
+    TR_mri: TR of MRI
+    TR_array: the time points of the dFC data, optional
+
+    This function keeps time points that the "shift" binarization marks as task (1)
+    or that the "mean" binarization marks as rest (0), and discards the grey-area
+    points in between. Kept points are labeled by the "shift" binarization; the
+    indices of the kept time points are returned as well.
+    """
+    task_presence_mean = extract_task_presence(
+        event_labels=event_labels,
+        TR_task=TR_task,
+        TR_mri=TR_mri,
+        TR_array=TR_array,
+        binary=True,
+        binarizing_method="mean",
+        no_hrf=False,
+    )
+    task_presence_shift = extract_task_presence(
+        event_labels=event_labels,
+        TR_task=TR_task,
+        TR_mri=TR_mri,
+        TR_array=TR_array,
+        binary=True,
+        binarizing_method="shift",
+        no_hrf=False,
+    )
+    indices = np.where((task_presence_mean == 0) | (task_presence_shift == 1))[0]
+
+    abs_task_presence = task_presence_shift.copy()
+    abs_task_presence = abs_task_presence[indices]
+
+    return abs_task_presence, indices
+
+
 ################################# Task Features ####################################
 
 

From 402e54b84a016433658a67a71a390d0ebc57bb66 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 23 May 2025 22:58:55 -0400
Subject: [PATCH 198/401] add SI to ML_scores

---
 pydfc/ml_utils.py | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index da32b8f..263491a 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -974,16 +974,6 @@ def SVM_classify(X_train, y_train, X_test, y_test):
     """
     SVM classification
     """
-
-    # set class weights so the task has 10 times more weight than the rest
-    # considering also their number of samples
-    task_count = np.sum(y_train == 1)
-    rest_count = np.sum(y_train == 0)
-    if task_count > rest_count:
-        class_weight = {0: 1, 1: 10}
-    else:
-        class_weight = {0: 1, 1: int(rest_count / task_count) * 10}
-
     # define the parameter grid
     param_grid = {
         "svc__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
@@ -993,7 +983,7 @@ def SVM_classify(X_train, y_train, X_test, y_test):
     # perform grid search
     model_for_hyperparam = make_pipeline(
         StandardScaler(),
-        SVC(kernel="rbf", class_weight=class_weight),
+        SVC(kernel="rbf"),
     )
     model_gscv = GridSearchCV(model_for_hyperparam, param_grid, cv=3, n_jobs=-1)
     model_gscv.fit(X_train, y_train)
@@ -1002,7 +992,7 @@ def SVM_classify(X_train, y_train, X_test, y_test):
 
     model = make_pipeline(
         StandardScaler(),
-        SVC(kernel="rbf", C=C, gamma=gamma, class_weight=class_weight),
+        SVC(kernel="rbf", C=C, gamma=gamma),
     ).fit(X_train, y_train)
 
     RESULT = {
@@ -1181,6 +1171,7 @@ def task_presence_classification(
     ML_scores = {
         "subj_id": list(),
         "group": list(),
+        "SI": list(),
         "task": list(),
         "run": list(),
         "dFC method": list(),
@@ -1243,6 +1234,8 @@ def task_presence_classification(
                 features = X_test_embedded[subj_label_test == subj, :]
                 target = y_test[subj_label_test == subj]
 
+            # Silhouette score
+            ML_scores["SI"].append(silhouette_score(features, target))
             # measure pred score using different metrics on each subj
             for model_name, model in ML_models.items():
                 pred = model.predict(features)

From 1da218f1c3f2eda5cbfe9cba670c24571b33ac7e Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 26 May 2025 12:05:03 -0400
Subject: [PATCH 199/401] update report

---
 task_dFC/generate_report.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 66fa455..a2e7dfc 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -657,16 +657,16 @@ def plot_classification_results(
         suffix = f"{ML_algorithm_name}_{task}_{run}_{embedding}"
 
     metrics = [
-        "accuracy",
+        # "accuracy",
         "balanced accuracy",
         "precision",
         "recall",
         "f1",
-        "tp",
-        "tn",
-        "fp",
-        "fn",
-        "average precision",
+        # "tp",
+        # "tn",
+        # "fp",
+        # "fn",
+        # "average precision",
     ]
 
     for metric in metrics:
@@ -1611,7 +1611,7 @@ def create_html_report_group_results(
                                     f"<img src='{classification_img}' alt='Classification results' width='{width}' height='{img_height}'>\n"
                                 )
 
-                            file.write("<br>\n")
+                        file.write("<br>\n")
 
     # clustering results
     img_height = 300

From 02e0fc82a808fe2b9816bd32b0810db65554661e Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 26 May 2025 12:39:59 -0400
Subject: [PATCH 200/401] add SI to report

---
 pydfc/report_util.py        |  45 +++
 task_dFC/generate_report.py | 541 ++++--------------------------------
 2 files changed, 92 insertions(+), 494 deletions(-)

diff --git a/pydfc/report_util.py b/pydfc/report_util.py
index 53d9a49..82f8d82 100644
--- a/pydfc/report_util.py
+++ b/pydfc/report_util.py
@@ -78,3 +78,48 @@ def plot_classification_metrics(
     )
 
     plt.close()
+
+
+def plot_clustering_metrics(dataframe, metric, title, suffix, output_dir):
+    """
+    Point-plot `metric` (e.g. "SI") per "dFC method", hued by train/test "group",
+    and save it to {output_dir}/clustering_{metric}_{suffix}.{save_fig_format}.
+    """
+    # dataframe must provide the "dFC method", "group" and `metric` columns.
+    plt.figure(figsize=(10, 5))
+
+    g = sns.pointplot(
+        data=dataframe,
+        x="dFC method",
+        y=f"{metric}",
+        hue="group",
+        hue_order=["train", "test"],
+        errorbar="sd",  # error bars span one standard deviation
+        linestyle="none",
+        dodge=True,
+        capsize=0.1,
+    )
+    plt.xlabel(g.get_xlabel(), fontweight="bold")
+    plt.ylabel(g.get_ylabel(), fontweight="bold")
+    plt.xticks(fontweight="bold")
+    plt.yticks(fontweight="bold")
+
+    # cap the y-axis at 1; None leaves the lower limit auto-scaled
+    g.set(ylim=(None, 1))
+    # NOTE(review): show_title is not a parameter; presumably a module-level flag.
+    if show_title:
+        g.set_title(title, fontdict={"fontsize": 10, "fontweight": "bold"})
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    # NOTE(review): save_fig_format / fig_* are presumably module-level settings.
+    metric_no_space = metric.replace(" ", "_")  # keep spaces out of the file name
+    plt.savefig(
+        f"{output_dir}/clustering_{metric_no_space}_{suffix}.{save_fig_format}",
+        dpi=fig_dpi,
+        bbox_inches=fig_bbox_inches,
+        pad_inches=fig_pad,
+        format=save_fig_format,
+    )
+
+    plt.close()  # release the current figure
diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index a2e7dfc..66c739b 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -20,7 +20,7 @@
     rank_norm,
     visualize_conn_mat_dict,
 )
-from pydfc.report_util import plot_classification_metrics
+from pydfc.report_util import plot_classification_metrics, plot_clustering_metrics
 
 ################################# Parameters ####################################
 
@@ -582,17 +582,17 @@ def plot_dFC_matrices(
         )
 
 
-def plot_classification_results(
+def plot_ML_results(
     ML_root,
     output_root,
     task,
     run=None,
     session=None,
-    ML_algorithm="KNN",
+    ML_algorithms=["KNN"],
     embedding="PCA",
 ):
     """
-    Plot the ML classification results for a given task, run and session.
+    Plot the ML classification results plus SI score for a given task, run and session.
     parameters:
     ----------
         ML_root: str, path to ML results
@@ -600,7 +600,7 @@ def plot_classification_results(
         task: str, task name
         run: int, run number
         session: str, session name
-        ML_algorithm: str, ML algorithm name (default: Random Forest, other options: Logistic regression, KNN, Gradient Boosting)
+        ML_algorithms: list of str, ML algorithm names to plot (default: ["KNN"]; other options: "Logistic regression", "SVM", "Random Forest", "Gradient Boosting")
         embedding: str, embedding method (default: PCA, other options: LE)
     """
     # the ML_scores files are saved as ML_scores_classify_{dFC_id}.npy
@@ -640,22 +640,6 @@ def plot_classification_results(
     else:
         output_dir = f"{output_root}/group_results/classification/{session}"
 
-    if ML_algorithm == "Logistic regression":
-        ML_algorithm_name = "LogReg"
-    elif ML_algorithm == "SVM":
-        ML_algorithm_name = "SVM"
-    elif ML_algorithm == "KNN":
-        ML_algorithm_name = "KNN"
-    elif ML_algorithm == "Random Forest":
-        ML_algorithm_name = "RF"
-    elif ML_algorithm == "Gradient Boosting":
-        ML_algorithm_name = "GBT"
-
-    if run is None:
-        suffix = f"{ML_algorithm_name}_{task}_{embedding}"
-    else:
-        suffix = f"{ML_algorithm_name}_{task}_{run}_{embedding}"
-
     metrics = [
         # "accuracy",
         "balanced accuracy",
@@ -669,300 +653,53 @@ def plot_classification_results(
         # "average precision",
     ]
 
-    for metric in metrics:
-        plot_classification_metrics(
-            dataframe=dataframe,
-            ML_algorithm=ML_algorithm,
-            pred_metric=metric,
-            title=task,
-            suffix=suffix,
-            output_dir=output_dir,
-        )
-
+    for ML_algorithm in ML_algorithms:
+        if ML_algorithm == "Logistic regression":
+            ML_algorithm_name = "LogReg"
+        elif ML_algorithm == "SVM":
+            ML_algorithm_name = "SVM"
+        elif ML_algorithm == "KNN":
+            ML_algorithm_name = "KNN"
+        elif ML_algorithm == "Random Forest":
+            ML_algorithm_name = "RF"
+        elif ML_algorithm == "Gradient Boosting":
+            ML_algorithm_name = "GBT"
 
-def plot_clustering_results(
-    ML_root, output_root, task, run=None, session=None, embedding="PCA"
-):
-    """
-    Plot the clustering results for a given task, run and session.
-    parameters:
-    ----------
-        ML_root: str, path to ML results
-        output_root: str, path to save the figures
-        task: str, task name
-        run: int, run number
-        session: str, session name
-        embedding: str, embedding method (default: PCA, other options: LE)
-    """
-    # the clustering_scores files are saved as clustering_scores_{dFC_id}.npy
-    # find all the clustering_scores files in the directory
-    if session is None:
-        input_dir = f"{ML_root}/clustering"
-    else:
-        input_dir = f"{ML_root}/clustering/{session}"
-    ALL_CLUSTERING_SCORES = os.listdir(input_dir)
-    ALL_CLUSTERING_SCORES = [
-        score_file
-        for score_file in ALL_CLUSTERING_SCORES
-        if "clustering_scores" in score_file
-    ]
-    ALL_CLUSTERING_SCORES.sort()
-    clustering_scores = None
-    for score_file in ALL_CLUSTERING_SCORES:
-        clustering_scores_new = np.load(
-            f"{input_dir}/{score_file}", allow_pickle="TRUE"
-        ).item()
-        if clustering_scores is None:
-            clustering_scores = clustering_scores_new
+        if run is None:
+            suffix = f"{ML_algorithm_name}_{task}_{embedding}"
         else:
-            for key in clustering_scores_new.keys():
-                clustering_scores[key].extend(clustering_scores_new[key])
-
-    sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0})
-
-    sns.set_style("darkgrid")
-
-    dataframe = pd.DataFrame(clustering_scores)
-    if run is not None:
-        dataframe = dataframe[dataframe["run"] == run]
-
-    dataframe = dataframe[dataframe["task"] == task]
-    dataframe = dataframe[dataframe["embedding"] == embedding]
+            suffix = f"{ML_algorithm_name}_{task}_{run}_{embedding}"
+
+        for metric in metrics:
+            plot_classification_metrics(
+                dataframe=dataframe,
+                ML_algorithm=ML_algorithm,
+                pred_metric=metric,
+                title=task,
+                suffix=suffix,
+                output_dir=output_dir,
+            )
 
-    # plot ARI score
-    plt.figure(figsize=(10, 5))
-    g = sns.pointplot(
-        data=dataframe,
-        x="dFC method",
-        y="Kmeans ARI",
-        errorbar="sd",
-        linestyle="none",
-        dodge=True,
-        capsize=0.1,
-    )
-    plt.xlabel(g.get_xlabel(), fontweight="bold")
-    plt.ylabel(g.get_ylabel(), fontweight="bold")
-    plt.xticks(fontweight="bold")
-    plt.yticks(fontweight="bold")
-    g.axhline(0.0, color="r", linestyle="--")
-    # set the y-axis upper limit to 1, but not set the lower limit
-    g.set(ylim=(None, 1))
-    if show_title:
-        g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"})
+    # Clustering SI score
 
     # save the figure
-    if session is None:
-        output_dir = f"{output_root}/group_results/clustering"
-    else:
-        output_dir = f"{output_root}/group_results/clustering/{session}"
-
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-
     if run is None:
-        plt.savefig(
-            f"{output_dir}/clustering_results_ARI_{task}_{embedding}.{save_fig_format}",
-            dpi=fig_dpi,
-            bbox_inches=fig_bbox_inches,
-            pad_inches=fig_pad,
-            format=save_fig_format,
-        )
+        suffix = f"{task}_{embedding}"
     else:
-        plt.savefig(
-            f"{output_dir}/clustering_results_ARI_{task}_{run}_{embedding}.{save_fig_format}",
-            dpi=fig_dpi,
-            bbox_inches=fig_bbox_inches,
-            pad_inches=fig_pad,
-            format=save_fig_format,
-        )
-
-    plt.close()
+        suffix = f"{task}_{run}_{embedding}"
 
-    # plot SI score
-    plt.figure(figsize=(10, 5))
-    g = sns.pointplot(
-        data=dataframe,
-        x="dFC method",
-        y="SI",
-        errorbar="sd",
-        linestyle="none",
-        dodge=True,
-        capsize=0.1,
-    )
-    plt.xlabel(g.get_xlabel(), fontweight="bold")
-    plt.ylabel(g.get_ylabel(), fontweight="bold")
-    plt.xticks(fontweight="bold")
-    plt.yticks(fontweight="bold")
-    # set the y-axis upper limit to 1, but not set the lower limit
-    g.set(ylim=(None, 1))
-    if show_title:
-        g.set_title(task, fontdict={"fontsize": 10, "fontweight": "bold"})
-    # save the figure
     if session is None:
         output_dir = f"{output_root}/group_results/clustering"
     else:
         output_dir = f"{output_root}/group_results/clustering/{session}"
 
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-
-    if run is None:
-        plt.savefig(
-            f"{output_dir}/clustering_results_SI_{task}_{embedding}.{save_fig_format}",
-            dpi=fig_dpi,
-            bbox_inches=fig_bbox_inches,
-            pad_inches=fig_pad,
-            format=save_fig_format,
-        )
-    else:
-        plt.savefig(
-            f"{output_dir}/clustering_results_SI_{task}_{run}_{embedding}.{save_fig_format}",
-            dpi=fig_dpi,
-            bbox_inches=fig_bbox_inches,
-            pad_inches=fig_pad,
-            format=save_fig_format,
-        )
-
-    plt.close()
-
-
-def plot_paradigm_clustering_score(
-    ML_root,
-    output_root,
-    session=None,
-    embedding="PCA",
-):
-    """
-    Plot the clustering results for a given task, run and session.
-    parameters:
-    ----------
-        ML_root: str, path to ML results
-        output_root: str, path to save the figures
-        task: str, task name
-        run: int, run number
-        session: str, session name
-        embedding: str, embedding method (default: PCA, other options: LE)
-    """
-    # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy
-    # find all the paradigm_clustering_RESULTS files in the directory
-    if session is None:
-        input_dir = f"{ML_root}/task_paradigm_clstr"
-    else:
-        input_dir = f"{ML_root}/task_paradigm_clstr/{session}"
-    ALL_PARADIGM_CLUSTERING_RESULTS = os.listdir(input_dir)
-    ALL_PARADIGM_CLUSTERING_RESULTS = [
-        result_file
-        for result_file in ALL_PARADIGM_CLUSTERING_RESULTS
-        if "task_paradigm_clstr_RESULTS_" in result_file
-    ]
-    ALL_PARADIGM_CLUSTERING_RESULTS.sort()
-    paradigm_clustering_RESULTS = {
-        "dFC method": [],
-        "ARI score": [],
-        "SI score": [],
-    }
-    for result_file in ALL_PARADIGM_CLUSTERING_RESULTS:
-        paradigm_clustering_RESULTS_new = np.load(
-            f"{input_dir}/{result_file}", allow_pickle="TRUE"
-        ).item()
-        paradigm_clustering_RESULTS["dFC method"].append(
-            paradigm_clustering_RESULTS_new[embedding]["dFC_method"]
-        )
-        paradigm_clustering_RESULTS["ARI score"].append(
-            paradigm_clustering_RESULTS_new[embedding]["ARI"]
-        )
-        paradigm_clustering_RESULTS["SI score"].append(
-            paradigm_clustering_RESULTS_new[embedding]["SI"]
-        )
-
-    sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.0})
-
-    sns.set_style("darkgrid")
-
-    dataframe = pd.DataFrame(paradigm_clustering_RESULTS)
-
-    # plot ARI score
-    plt.figure(figsize=(10, 5))
-    g = sns.pointplot(
-        data=dataframe,
-        x="dFC method",
-        y="ARI score",
-        linestyle="none",
-        dodge=True,
-        capsize=0.1,
+    plot_clustering_metrics(
+        dataframe=dataframe,
+        metric="SI",
+        title=task,
+        suffix=suffix,
+        output_dir=output_dir,
     )
-    plt.xlabel(g.get_xlabel(), fontweight="bold")
-    plt.ylabel(g.get_ylabel(), fontweight="bold")
-    plt.xticks(fontweight="bold")
-    plt.yticks(fontweight="bold")
-    g.axhline(0.0, color="r", linestyle="--")
-    # set the y-axis upper limit to 1, but not set the lower limit
-    g.set(ylim=(None, 1))
-    if show_title:
-        g.set_title(
-            "Task Paradigm Clustering Performance",
-            fontdict={"fontsize": 10, "fontweight": "bold"},
-        )
-
-    # save the figure
-    if session is None:
-        output_dir = f"{output_root}/group_results/paradigm_clustering"
-    else:
-        output_dir = f"{output_root}/group_results/paradigm_clustering/{session}"
-
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-
-    plt.savefig(
-        f"{output_dir}/paradigm_clustering_results_ARI_{embedding}.{save_fig_format}",
-        dpi=fig_dpi,
-        bbox_inches=fig_bbox_inches,
-        pad_inches=fig_pad,
-        format=save_fig_format,
-    )
-
-    plt.close()
-
-    # plot SI score
-    plt.figure(figsize=(10, 5))
-    g = sns.pointplot(
-        data=dataframe,
-        x="dFC method",
-        y="SI score",
-        linestyle="none",
-        dodge=True,
-        capsize=0.1,
-    )
-    plt.xlabel(g.get_xlabel(), fontweight="bold")
-    plt.ylabel(g.get_ylabel(), fontweight="bold")
-    plt.xticks(fontweight="bold")
-    plt.yticks(fontweight="bold")
-    # set the y-axis upper limit to 1, but not set the lower limit
-    g.set(ylim=(None, 1))
-    if show_title:
-        g.set_title(
-            "Task Paradigm Clustering Performance",
-            fontdict={"fontsize": 10, "fontweight": "bold"},
-        )
-
-    # save the figure
-    if session is None:
-        output_dir = f"{output_root}/group_results/paradigm_clustering"
-    else:
-        output_dir = f"{output_root}/group_results/paradigm_clustering/{session}"
-
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-
-    plt.savefig(
-        f"{output_dir}/paradigm_clustering_results_SI_{embedding}.{save_fig_format}",
-        dpi=fig_dpi,
-        bbox_inches=fig_bbox_inches,
-        pad_inches=fig_pad,
-        format=save_fig_format,
-    )
-
-    plt.close()
 
 
 def plot_visual_clstr_centroids(
@@ -1092,156 +829,6 @@ def plot_visual_clstr_centroids(
         plt.close()
 
 
-# def plot_paradigm_clstr_centroids(
-#     ML_root,
-#     output_root,
-#     session=None,
-# ):
-#     """ """
-#     # the paradigm_clustering_RESULTS files are saved as task_paradigm_clstr_RESULTS_{dFC_id}.npy
-#     # find all the paradigm_clustering_RESULTS files in the directory
-#     if session is None:
-#         input_dir = f"{ML_root}"
-#     else:
-#         input_dir = f"{ML_root}/{session}"
-
-#     if session is None:
-#         output_dir = f"{output_root}/group_results/paradigm_clustering_centroids"
-#     else:
-#         output_dir = (
-#             f"{output_root}/group_results/paradigm_clustering_centroids/{session}"
-#         )
-
-#     if not os.path.exists(output_dir):
-#         os.makedirs(output_dir)
-
-#     ALL_PARADIGM_CLUSTERING_RESULTS = os.listdir(input_dir)
-#     ALL_PARADIGM_CLUSTERING_RESULTS = [
-#         result_file
-#         for result_file in ALL_PARADIGM_CLUSTERING_RESULTS
-#         if "task_paradigm_clstr_RESULTS_" in result_file
-#     ]
-#     ALL_PARADIGM_CLUSTERING_RESULTS.sort()
-
-#     for result_file in ALL_PARADIGM_CLUSTERING_RESULTS:
-#         paradigm_clustering_RESULTS_new = np.load(
-#             f"{input_dir}/{result_file}", allow_pickle="TRUE"
-#         ).item()
-
-#         measure_name = paradigm_clustering_RESULTS_new["dFC_method"]
-#         centroids_mats = paradigm_clustering_RESULTS_new["centroids"]
-
-#         centroids_dict = {}
-#         for i, centroid_mat in enumerate(centroids_mats):
-#             centroids_dict[f"Cluster {i + 1}"] = centroid_mat
-
-#         visualize_conn_mat_dict(
-#             data=centroids_dict,
-#             title=f"Task Paradigm Centroids {measure_name}",
-#             cmap="seismic",
-#             normalize=True,
-#             disp_diag=False,
-#             save_image=True,
-#             output_root=f"{output_dir}/",
-#             center_0=True,
-#             # node_networks=None,
-#         )
-
-
-# def plot_dFC_clustering(
-#     dFC_root,
-#     subj,
-#     task,
-#     start_time,
-#     end_time,
-#     output_root,
-#     run=None,
-#     session=None,
-#     normalize_dFC=True,
-# ):
-#     task_data = load_task_data(roi_root, subj, task, run, session)
-#     TR_mri = task_data["TR_mri"]
-
-#     for dFC_id in range(
-#         0, 20
-#     ):  # change this to the number of dFCs you have or right a function that finds available dFC ids
-#         try:
-#             dFC = load_dFC(dFC_root, subj, task, dFC_id, run, session)
-#         except Exception:
-#             pass
-
-#         dFC_mat = dFC.get_dFC_mat()
-#         TR_array = dFC.TR_array
-#         if normalize_dFC:
-#             dFC_mat = rank_norm(dFC_mat)
-#         dFC_vecs = dFC_mat2vec(dFC_mat)
-
-#         if session is None:
-#             clustering_RESULTS = np.load(
-#                 f"{ML_root}/clustering_RESULTS_{dFC_id}.npy", allow_pickle="TRUE"
-#             ).item()
-#         else:
-#             clustering_RESULTS = np.load(
-#                 f"{ML_root}/{session}/clustering_RESULTS_{dFC_id}.npy",
-#                 allow_pickle="TRUE",
-#             ).item()
-
-#         if run is None:
-#             scaler = clustering_RESULTS[task]["StandardScaler"]
-#             pca = clustering_RESULTS[task]["PCA"]
-#             kmeans = clustering_RESULTS[task]["kmeans"]
-#         else:
-#             scaler = clustering_RESULTS[task][run]["StandardScaler"]
-#             pca = clustering_RESULTS[task][run]["PCA"]
-#             kmeans = clustering_RESULTS[task][run]["kmeans"]
-
-#         dFC_vecs_normalized = scaler.transform(dFC_vecs)
-#         dFC_vecs_pca = pca.transform(dFC_vecs_normalized)
-#         cluster_labels = kmeans.predict(dFC_vecs_pca)
-
-#         start_TR = int(start_time / TR_mri)
-#         end_TR = int(end_time / TR_mri)
-
-#         start_TR_idx = np.where(np.array(TR_array) >= start_TR)[0][0]
-#         end_TR_idx = np.where(np.array(TR_array) <= end_TR)[0][-1]
-
-#         fig_width = int(2.5 * (end_time - start_time) / 2)
-#         fig_width = min(fig_width, 500)
-#         plt.figure(figsize=(fig_width, 5))
-#         time = TR_array[start_TR_idx:end_TR_idx] * TR_mri
-#         plt.plot(
-#             time[start_TR:end_TR], cluster_labels[start_TR_idx:end_TR_idx], linewidth=4
-#         )
-#         # put vertical lines at the start of each TR
-#         for t in time:
-#             plt.axvline(x=t, color="r", linestyle="--")
-#             # plt.text(t, 0.5, f"TR {int(t/TR_mri)}", fontsize=8, color='black', ha='center')
-#         plt.title(f"Cluster labels of {dFC.measure.measure_name}")
-#         plt.xlabel("Time (s)")
-
-#         # save the figure
-#         output_dir = f"{output_root}/subject_results/{subj}/dFC_clustering"
-#         if session is not None:
-#             output_dir = f"{output_dir}/{session}"
-#         output_dir = f"{output_dir}/{task}"
-#         if run is not None:
-#             output_dir = f"{output_dir}/{run}"
-#         output_dir = f"{output_dir}/"
-
-#         if not os.path.exists(output_dir):
-#             os.makedirs(output_dir)
-
-#         plt.savefig(
-#             f"{output_dir}/dFC_clustering_{dFC.measure.measure_name}.{save_fig_format}",
-#             dpi=fig_dpi,
-#             bbox_inches=fig_bbox_inches,
-#             pad_inches=fig_pad,
-#             format=save_fig_format,
-#         )
-
-#         plt.close()
-
-
 def plot_task_presence_features(
     ML_root,
     output_root,
@@ -1561,16 +1148,16 @@ def create_html_report_group_results(
 
     # classification results
     metrics = [
-        "accuracy",
+        # "accuracy",
         "balanced accuracy",
         "precision",
         "recall",
         "f1",
-        "tp",
-        "tn",
-        "fp",
-        "fn",
-        "average precision",
+        # "tp",
+        # "tn",
+        # "fp",
+        # "fn",
+        # "average precision",
     ]
     classification_models = {"LogReg": "Logistic Regression", "SVM": "SVM"}
     img_height = 300
@@ -2075,17 +1662,6 @@ def create_html_report_group_results(
         except Exception as e:
             print(f"Error in plotting task presence features: {e}")
 
-        for embedding in ["PCA", "LE"]:
-            try:
-                plot_paradigm_clustering_score(
-                    ML_root=ML_root,
-                    output_root=reports_root,
-                    session=session,
-                    embedding=embedding,
-                )
-            except Exception as e:
-                print(f"Error in plotting paradigm clustering scores: {e}")
-
         try:
             plot_visual_clstr_centroids(
                 ML_root=ML_root,
@@ -2095,44 +1671,21 @@ def create_html_report_group_results(
         except Exception as e:
             print(f"Error in plotting visual clustering centroids: {e}")
 
-        # try:
-        #     plot_paradigm_clstr_centroids(
-        #         ML_root=ML_root,
-        #         output_root=reports_root,
-        #         session=session,
-        #     )
-        # except Exception as e:
-        #     print(f"Error in plotting paradigm clustering centroids: {e}")
-
         for task in TASKS:
             for run in RUNS[task]:
                 for embedding in ["PCA", "LE"]:
-                    for ML_algorithm in ["SVM", "Logistic regression"]:
-                        try:
-                            plot_classification_results(
-                                ML_root=ML_root,
-                                output_root=reports_root,
-                                task=task,
-                                run=run,
-                                session=session,
-                                ML_algorithm=ML_algorithm,
-                                embedding=embedding,
-                            )
-                        except Exception as e:
-                            print(
-                                f"Error in plotting ML results for {ML_algorithm} and {embedding}: {e}"
-                            )
                     try:
-                        plot_clustering_results(
+                        plot_ML_results(
                             ML_root=ML_root,
                             output_root=reports_root,
                             task=task,
                             run=run,
                             session=session,
+                            ML_algorithm=["SVM", "Logistic regression"],
                             embedding=embedding,
                         )
                     except Exception as e:
-                        print(f"Error in plotting clustering results: {e}")
+                        print(f"Error in plotting ML results for {embedding}: {e}")
 
     # create html report
     try:

From b85eff22b4d5831247db5b1a5866e19930f7687c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 26 May 2025 12:48:32 -0400
Subject: [PATCH 201/401] minor

---
 task_dFC/generate_report.py | 128 ++----------------------------------
 1 file changed, 5 insertions(+), 123 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 66c739b..f2a36a9 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -1058,28 +1058,6 @@ def create_html_report_subj_results(
                             )
                             file.write("<br>\n")
 
-                # # display dFC clustering
-                # img_height = 100
-                # # for dFC matrices find all png files in the directory
-                # dFC_clustering_dir = f"{subj_dir}/dFC_clustering/{session_task_run_dir}"
-                # if os.path.exists(dFC_clustering_dir):
-                #     for file_name in os.listdir(dFC_clustering_dir):
-                #         if file_name.endswith(".png"):
-                #             file.write(
-                #                 f"<h3>{file_name[file_name.find('dFC_clustering_')+15:file_name.find('.png')]}</h3>\n"
-                #             )
-                #             dFC_clustering_img = f"{dFC_clustering_dir}/{file_name}"
-                #             # get the original size of the image
-                #             img = plt.imread(dFC_clustering_img)
-                #             height, width, _ = img.shape
-                #             # change the width so that height equals img_height
-                #             width = int(width * img_height / height)
-                #             # replace the path to the image with a relative path
-                #             dFC_clustering_img = dFC_clustering_img.replace(subj_dir, ".")
-                #             file.write(
-                #                 f"<img src='{dFC_clustering_img}' alt='{file_name}' width='{width}' height='{img_height}'>\n"
-                #             )
-                #             file.write("<br>\n")
     file.write("</body>\n")
     file.write("</html>\n")
     file.close()
@@ -1220,27 +1198,13 @@ def create_html_report_group_results(
                     file.write(f"<h3>{embedding}</h3>\n")
                     # display clustering ARI results
                     if run is None:
-                        clustering_img = f"{clustering_dir}/clustering_results_ARI_{task}_{embedding}.png"
-                    else:
-                        clustering_img = f"{clustering_dir}/clustering_results_ARI_{task}_{run}_{embedding}.png"
-                    if os.path.exists(clustering_img):
-                        img = plt.imread(clustering_img)
-                        height, width, _ = img.shape
-                        # change the width so that height equals img_height
-                        width = int(width * img_height / height)
-                        # replace the path to the image with a relative path
-                        clustering_img = clustering_img.replace(group_dir, ".")
-                        file.write(
-                            f"<img src='{clustering_img}' alt='Clustering results' width='{width}' height='{img_height}'>\n"
+                        clustering_img = (
+                            f"{clustering_dir}/clustering_SI_{task}_{embedding}.png"
                         )
-
-                        file.write("<br>\n")
-
-                    # display clustering SI results
-                    if run is None:
-                        clustering_img = f"{clustering_dir}/clustering_results_SI_{task}_{embedding}.png"
                     else:
-                        clustering_img = f"{clustering_dir}/clustering_results_SI_{task}_{run}_{embedding}.png"
+                        clustering_img = (
+                            f"{clustering_dir}/clustering_SI_{task}_{run}_{embedding}.png"
+                        )
                     if os.path.exists(clustering_img):
                         img = plt.imread(clustering_img)
                         height, width, _ = img.shape
@@ -1254,58 +1218,6 @@ def create_html_report_group_results(
 
                         file.write("<br>\n")
 
-    # paradigm clustering results
-    file.write("<h1>Paradigm Clustering Results</h1>\n")
-    for session in SESSIONS:
-        if session is not None:
-            file.write(f"<h1> {session} </h1>\n")
-        if session is not None:
-            paradigm_clustering_dir = f"{group_dir}/paradigm_clustering/{session}"
-        else:
-            paradigm_clustering_dir = f"{group_dir}/paradigm_clustering"
-
-        # display paradigm clustering ARI scores
-        img_height = 300
-        file.write("<h2>Paradigm Clustering ARI Scores</h2>\n")
-        for embedding in ["PCA", "LE"]:
-            file.write(f"<h3>{embedding}</h3>\n")
-            paradigm_clustering_img = f"{paradigm_clustering_dir}/paradigm_clustering_results_ARI_{embedding}.png"
-            try:
-                img = plt.imread(paradigm_clustering_img)
-                height, width, _ = img.shape
-                # change the width so that height equals img_height
-                width = int(width * img_height / height)
-                # replace the path to the image with a relative path
-                paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".")
-                file.write(
-                    f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
-                )
-            except Exception as e:
-                print(f"Error: {e}")
-
-            file.write("<br>\n")
-
-        # display paradigm clustering SI scores
-        img_height = 300
-        file.write("<h2>Paradigm Clustering SI Scores</h2>\n")
-        for embedding in ["PCA", "LE"]:
-            file.write(f"<h3>{embedding}</h3>\n")
-            paradigm_clustering_img = f"{paradigm_clustering_dir}/paradigm_clustering_results_SI_{embedding}.png"
-            try:
-                img = plt.imread(paradigm_clustering_img)
-                height, width, _ = img.shape
-                # change the width so that height equals img_height
-                width = int(width * img_height / height)
-                # replace the path to the image with a relative path
-                paradigm_clustering_img = paradigm_clustering_img.replace(group_dir, ".")
-                file.write(
-                    f"<img src='{paradigm_clustering_img}' alt='Paradigm clustering results' width='{width}' height='{img_height}'>\n"
-                )
-            except Exception as e:
-                print(f"Error: {e}")
-
-            file.write("<br>\n")
-
     # display visual clustering centroids
     img_height = 300
     file.write("<h1>Visual Clustering Centroids</h2>\n")
@@ -1413,36 +1325,6 @@ def create_html_report_group_results(
 
                     file.write("<br>\n")
 
-        # # display paradigm clustering centroids
-        # img_height = 300
-        # file.write("<h2>Paradigm Clustering Centroids</h2>\n")
-        # # find all png files in the directory
-        # paradigm_clustering_centroids_dir = f"{group_dir}/paradigm_clustering_centroids"
-        # for file_name in os.listdir(paradigm_clustering_centroids_dir):
-        #     if file_name.endswith(".png"):
-        #         measure_name = file_name[
-        #             file_name.find("Task_Paradigm_Centroids_") + 24 : -4
-        #         ]
-        #         file.write(f"<h3>{measure_name}</h3>\n")
-        #         paradigm_clustering_centroids_img = (
-        #             f"{paradigm_clustering_centroids_dir}/{file_name}"
-        #         )
-        #         # get the original size of the image
-        #         img = plt.imread(paradigm_clustering_centroids_img)
-        #         height, width, _ = img.shape
-        #         # change the width so that height equals img_height
-        #         width = int(width * img_height / height)
-        #         # replace the path to the image with a relative path
-        #         paradigm_clustering_centroids_img = (
-        #             paradigm_clustering_centroids_img.replace(group_dir, ".")
-        #         )
-        #         file.write(
-        #             f"<img src='{paradigm_clustering_centroids_img}' alt='Paradigm clustering centroids' width='{width}' height='{img_height}'>\n"
-        #         )
-        #         file.write("<br>\n")
-
-        # file.write("<br>\n")
-
     file.write("</body>\n")
     file.write("</html>\n")
     file.close()

From 5a7683eb2211f57a6621cbed052b8807000a7ead Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 26 May 2025 15:59:54 -0400
Subject: [PATCH 202/401] fix bug: rename ML_algorithm kwarg to ML_algorithms and comment out plot_glm call in report

---
 task_dFC/generate_report.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index f2a36a9..2cbdbe5 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -1433,21 +1433,21 @@ def create_html_report_group_results(
                     except Exception as e:
                         print(f"Error in plotting dFC matrices: {e}")
 
-                    try:
-                        plot_glm(
-                            fmriprep_root=fmriprep_root,
-                            roi_root=roi_root,
-                            subj=subj,
-                            task=task,
-                            bold_suffix=dataset_info["bold_suffix"],
-                            trial_type_label=dataset_info["trial_type_label"],
-                            rest_labels=dataset_info["rest_labels"],
-                            output_root=reports_root,
-                            run=run,
-                            session=session,
-                        )
-                    except Exception as e:
-                        print(f"Error in plotting GLM: {e}")
+                    # try:
+                    #     plot_glm(
+                    #         fmriprep_root=fmriprep_root,
+                    #         roi_root=roi_root,
+                    #         subj=subj,
+                    #         task=task,
+                    #         bold_suffix=dataset_info["bold_suffix"],
+                    #         trial_type_label=dataset_info["trial_type_label"],
+                    #         rest_labels=dataset_info["rest_labels"],
+                    #         output_root=reports_root,
+                    #         run=run,
+                    #         session=session,
+                    #     )
+                    # except Exception as e:
+                    #     print(f"Error in plotting GLM: {e}")
 
                     try:
                         plot_roi_signals(
@@ -1563,7 +1563,7 @@ def create_html_report_group_results(
                             task=task,
                             run=run,
                             session=session,
-                            ML_algorithm=["SVM", "Logistic regression"],
+                            ML_algorithms=["SVM", "Logistic regression"],
                             embedding=embedding,
                         )
                     except Exception as e:

From 806e81ed2b95091cd5c5091a204deeede9c05896 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 27 May 2025 17:25:31 -0400
Subject: [PATCH 203/401] refactor ml_utils

---
 pydfc/ml_utils.py | 412 +++++++++++++++++++++++++++++++++-------------
 task_dFC/ML.py    |   9 +-
 2 files changed, 299 insertions(+), 122 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 263491a..7fae9ee 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -11,6 +11,7 @@
 import numpy as np
 from scipy.spatial import procrustes
 from scipy.stats import zscore
+from sklearn.base import clone
 from sklearn.cluster import KMeans
 from sklearn.decomposition import PCA
 from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
@@ -27,7 +28,7 @@
     recall_score,
     silhouette_score,
 )
-from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
+from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors, kneighbors_graph
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
@@ -938,6 +939,43 @@ def embed_dFC_features(
 ################################# Classification Framework Functions ####################################
 
 
+def get_classification_results(
+    X_train,
+    X_test,
+    y_train,
+    y_test,
+    classifier_model=None,
+):
+    """
+    Get classification results for a given classifier.
+    This function fits the classifier, predicts the labels for train and test sets,
+    and calculates the balanced accuracy score, recall, precision, and f1 for both sets.
+
+    cloning ensures that the classifier is not fitted and the original classifier remains unchanged.
+    """
+    classifier_model = clone(classifier_model)
+    classifier_model.fit(X_train, y_train)
+    y_train_pred = classifier_model.predict(X_train)
+    y_test_pred = classifier_model.predict(X_test)
+
+    RESULT = {
+        "model": classifier_model,
+        "train": {
+            "balanced accuracy": balanced_accuracy_score(y_train, y_train_pred),
+            "recall": recall_score(y_train, y_train_pred),
+            "precision": precision_score(y_train, y_train_pred),
+            "f1": f1_score(y_train, y_train_pred),
+        },
+        "test": {
+            "balanced accuracy": balanced_accuracy_score(y_test, y_test_pred),
+            "recall": recall_score(y_test, y_test_pred),
+            "precision": precision_score(y_test, y_test_pred),
+            "f1": f1_score(y_test, y_test_pred),
+        },
+    }
+    return RESULT
+
+
 def logistic_regression_classify(X_train, y_train, X_test, y_test):
     """
     Logistic regression classification
@@ -955,17 +993,18 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test):
 
     C = lr_gscv.best_params_["logisticregression__C"]
 
-    log_reg = make_pipeline(
+    model = make_pipeline(
         StandardScaler(),
         LogisticRegression(penalty="l1", C=C, solver="saga"),
-    ).fit(X_train, y_train)
+    )
 
-    RESULT = {
-        "log_reg_model": log_reg,
-        "log_reg_C": C,
-        "log_reg_train_score": log_reg.score(X_train, y_train),
-        "log_reg_test_score": log_reg.score(X_test, y_test),
-    }
+    RESULT = get_classification_results(
+        X_train=X_train,
+        X_test=X_test,
+        y_train=y_train,
+        y_test=y_test,
+        classifier_model=model,
+    )
 
     return RESULT
 
@@ -990,17 +1029,19 @@ def SVM_classify(X_train, y_train, X_test, y_test):
     C = model_gscv.best_params_["svc__C"]
     gamma = model_gscv.best_params_["svc__gamma"]
 
+    # this is for permutation tests
     model = make_pipeline(
         StandardScaler(),
         SVC(kernel="rbf", C=C, gamma=gamma),
-    ).fit(X_train, y_train)
+    )
 
-    RESULT = {
-        "SVM_cv_results": model_gscv.cv_results_,
-        "SVM_model": model,
-        "SVM_train_score": model.score(X_train, y_train),
-        "SVM_test_score": model.score(X_test, y_test),
-    }
+    RESULT = get_classification_results(
+        X_train=X_train,
+        X_test=X_test,
+        y_train=y_train,
+        y_test=y_test,
+        classifier_model=model,
+    )
     return RESULT
 
 
@@ -1022,17 +1063,19 @@ def KNN_classify(X_train, y_train, X_test, y_test):
 
     n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"]
 
-    neigh = make_pipeline(
+    # this is for permutation tests
+    model = make_pipeline(
         StandardScaler(),
         KNeighborsClassifier(n_neighbors=n_neighbors),
-    ).fit(X_train, y_train)
+    )
 
-    RESULT = {
-        "KNN_cv_results": knn_gscv.cv_results_,
-        "KNN_model": neigh,
-        "KNN_train_score": neigh.score(X_train, y_train),
-        "KNN_test_score": neigh.score(X_test, y_test),
-    }
+    RESULT = get_classification_results(
+        X_train=X_train,
+        X_test=X_test,
+        y_train=y_train,
+        y_test=y_test,
+        classifier_model=model,
+    )
 
     return RESULT
 
@@ -1059,17 +1102,19 @@ def random_forest_classify(X_train, y_train, X_test, y_test):
     n_estimators = rf_gscv.best_params_["randomforestclassifier__n_estimators"]
     max_depth = rf_gscv.best_params_["randomforestclassifier__max_depth"]
 
-    rf = make_pipeline(
+    # this is for permutation tests
+    model = make_pipeline(
         StandardScaler(),
         RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth),
-    ).fit(X_train, y_train)
+    )
 
-    RESULT = {
-        "RF_cv_results": rf_gscv.cv_results_,
-        "RF_model": rf,
-        "RF_train_score": rf.score(X_train, y_train),
-        "RF_test_score": rf.score(X_test, y_test),
-    }
+    RESULT = get_classification_results(
+        X_train=X_train,
+        X_test=X_test,
+        y_train=y_train,
+        y_test=y_test,
+        classifier_model=model,
+    )
 
     return RESULT
 
@@ -1098,23 +1143,108 @@ def gradient_boosting_classify(X_train, y_train, X_test, y_test):
     learning_rate = gb_gscv.best_params_["gradientboostingclassifier__learning_rate"]
     max_depth = gb_gscv.best_params_["gradientboostingclassifier__max_depth"]
 
-    gb = make_pipeline(
+    # this is for permutation tests
+    model = make_pipeline(
         StandardScaler(),
         GradientBoostingClassifier(
             n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate
         ),
-    ).fit(X_train, y_train)
+    )
 
-    RESULT = {
-        "GB_cv_results": gb_gscv.cv_results_,
-        "GB_model": gb,
-        "GB_train_score": gb.score(X_train, y_train),
-        "GB_test_score": gb.score(X_test, y_test),
-    }
+    RESULT = get_classification_results(
+        X_train=X_train,
+        X_test=X_test,
+        y_train=y_train,
+        y_test=y_test,
+        classifier_model=model,
+    )
 
     return RESULT
 
 
+def get_permutation_scores(
+    X_train,
+    y_train,
+    X_test,
+    y_test,
+    classifier_model,
+    n_permutations=100,
+):
+    """
+    Get permutation scores for a given classifier and data.
+    cloning ensures that the classifier is not previously fitted.
+    """
+    # first get the true balanced accuracy scores from original data
+    classifier_original = clone(classifier_model)
+    classifier_original.fit(X_train, y_train)
+    y_train_pred = classifier_original.predict(X_train)
+    y_test_pred = classifier_original.predict(X_test)
+
+    # next calculate the balanced accuracy scores for permuted data
+    permutation_train_scores = []
+    permutation_test_scores = []
+    for _ in range(n_permutations):
+        y_train_permuted = np.random.permutation(y_train)
+        model_permuted = clone(classifier_model)
+        model_permuted.fit(X_train, y_train_permuted)
+
+        y_train_permuted_pred = model_permuted.predict(X_train)
+        y_test_permuted_pred = model_permuted.predict(X_test)
+        permutation_train_scores.append(
+            balanced_accuracy_score(y_train_permuted, y_train_permuted_pred)
+        )
+        permutation_test_scores.append(
+            balanced_accuracy_score(y_test, y_test_permuted_pred)
+        )
+    p_value_train = (
+        np.sum(
+            np.array(permutation_train_scores)
+            >= balanced_accuracy_score(y_train, y_train_pred)
+        )
+        + 1
+    ) / (len(permutation_train_scores) + 1)
+    p_value_test = (
+        np.sum(
+            np.array(permutation_test_scores)
+            >= balanced_accuracy_score(y_test, y_test_pred)
+        )
+        + 1
+    ) / (len(permutation_test_scores) + 1)
+
+    return permutation_train_scores, permutation_test_scores, p_value_train, p_value_test
+
+
+def get_classification_scores(
+    target,
+    pred,
+):
+    """
+    Get classification scores for a given target and predicted labels.
+    Returns a dictionary with these metrics:
+    - accuracy
+    - balanced accuracy
+    - recall
+    - precision
+    - f1 score
+    - fp, fn, tp, tn
+    - average precision
+    """
+    tn, fp, fn, tp = confusion_matrix(target, pred).ravel()
+    scores = {
+        "accuracy": accuracy_score(target, pred),
+        "balanced accuracy": balanced_accuracy_score(target, pred),
+        "recall": recall_score(target, pred),
+        "precision": precision_score(target, pred),
+        "f1": f1_score(target, pred),
+        "fp": fp,
+        "fn": fn,
+        "tp": tp,
+        "tn": tn,
+        "average precision": average_precision_score(target, pred),
+    }
+    return scores
+
+
 def task_presence_classification(
     task,
     dFC_id,
@@ -1167,15 +1297,24 @@ def task_presence_classification(
         )
     )
 
-    ML_RESULT = {"PCA": {}, "LE": {}}
     ML_scores = {
-        "subj_id": list(),
-        "group": list(),
-        "SI": list(),
-        "task": list(),
-        "run": list(),
-        "dFC method": list(),
-        "embedding": list(),
+        "group_lvl": {
+            "task": list(),
+            "run": list(),
+            "dFC method": list(),
+            "embedding": list(),
+            "group": list(),
+            "SI": list(),
+        },
+        "subj_lvl": {
+            "subj_id": list(),
+            "group": list(),
+            "SI": list(),
+            "task": list(),
+            "run": list(),
+            "dFC method": list(),
+            "embedding": list(),
+        },
     }
     for embedding in ["PCA", "LE"]:
         # embed dFC features
@@ -1197,6 +1336,12 @@ def task_presence_classification(
         except:
             continue
 
+        # Silhouette score
+        SI = {
+            "train": silhouette_score(X_train_embedded, y_train),
+            "test": silhouette_score(X_test_embedded, y_test),
+        }
+
         # task presence classification
 
         print("task presence classification ...")
@@ -1209,89 +1354,128 @@ def task_presence_classification(
         # SVM
         SVM_RESULT = SVM_classify(X_train_embedded, y_train, X_test_embedded, y_test)
 
-        # # KNN
-        # KNN_RESULT = KNN_classify(X_train_embedded, y_train, X_test_embedded, y_test)
+        ML_models = {"Logistic regression": log_reg_RESULT, "SVM": SVM_RESULT}
 
-        for key in log_reg_RESULT:
-            ML_RESULT[embedding][key] = log_reg_RESULT[key]
-        for key in SVM_RESULT:
-            ML_RESULT[embedding][key] = SVM_RESULT[key]
+        # permutation tests
+        permutation_scores = {
+            "train": {},
+            "test": {},
+        }
+        for model_name in ML_models:
+            (
+                permutation_train_scores,
+                permutation_test_scores,
+                p_value_train,
+                p_value_test,
+            ) = get_permutation_scores(
+                X_train=X_train_embedded,
+                y_train=y_train,
+                X_test=X_test_embedded,
+                y_test=y_test,
+                classifier_model=ML_models[model_name]["model"],
+                n_permutations=100,
+            )
+            permutation_scores["train"][
+                f"{model_name} permutation p_value"
+            ] = p_value_train
+            permutation_scores["train"][f"{model_name} permutation score mean"] = np.mean(
+                permutation_train_scores
+            )
+            permutation_scores["train"][f"{model_name} permutation score std"] = np.std(
+                permutation_train_scores
+            )
+            permutation_scores["test"][f"{model_name} permutation p_value"] = p_value_test
+            permutation_scores["test"][f"{model_name} permutation score mean"] = np.mean(
+                permutation_test_scores
+            )
+            permutation_scores["test"][f"{model_name} permutation score std"] = np.std(
+                permutation_test_scores
+            )
+
+        # group level scores
+        for group in ["train", "test"]:
 
-        # measure pred score on each subj
-        log_reg = log_reg_RESULT["log_reg_model"]
-        SVM = SVM_RESULT["SVM_model"]
+            ML_scores["group_lvl"]["group"].append(group)
+            ML_scores["group_lvl"]["embedding"].append(embedding)
+            ML_scores["group_lvl"]["task"].append(task)
+            ML_scores["group_lvl"]["run"].append(run)
+            ML_scores["group_lvl"]["dFC method"].append(measure_name)
+            # SI
+            ML_scores["group_lvl"]["SI"].append(SI[group])
+
+            for model_name in ML_models:
+                # accuracy score
+                for metric in ML_models[model_name][group]:
+                    if not f"{model_name} {metric}" in ML_scores["group_lvl"]:
+                        ML_scores["group_lvl"][f"{model_name} {metric}"] = list()
+                    ML_scores["group_lvl"][f"{model_name} {metric}"].append(
+                        ML_models[model_name][group][metric]
+                    )
 
-        ML_models = {"Logistic regression": log_reg, "SVM": SVM}
+                # permutation test results
+                for key in permutation_scores[group]:
+                    if not key in ML_scores["group_lvl"]:
+                        ML_scores["group_lvl"][key] = list()
+                    ML_scores["group_lvl"][key].append(permutation_scores[group][key])
 
+        # subject level scores
         for subj in SUBJECTS:
-            ML_scores["subj_id"].append(subj)
+            ML_scores["subj_lvl"]["subj_id"].append(subj)
             if subj in train_subjects:
-                ML_scores["group"].append("train")
+                ML_scores["subj_lvl"]["group"].append("train")
                 features = X_train_embedded[subj_label_train == subj, :]
                 target = y_train[subj_label_train == subj]
             elif subj in test_subjects:
-                ML_scores["group"].append("test")
+                ML_scores["subj_lvl"]["group"].append("test")
                 features = X_test_embedded[subj_label_test == subj, :]
                 target = y_test[subj_label_test == subj]
 
             # Silhouette score
-            ML_scores["SI"].append(silhouette_score(features, target))
+            ML_scores["subj_lvl"]["SI"].append(silhouette_score(features, target))
             # measure pred score using different metrics on each subj
-            for model_name, model in ML_models.items():
+            for model_name in ML_models:
+                model = ML_models[model_name]["model"]
                 pred = model.predict(features)
-                # accuracy score
-                if not f"{model_name} accuracy" in ML_scores:
-                    ML_scores[f"{model_name} accuracy"] = list()
-                ML_scores[f"{model_name} accuracy"].append(accuracy_score(target, pred))
-                # balanced accuracy score
-                if not f"{model_name} balanced accuracy" in ML_scores:
-                    ML_scores[f"{model_name} balanced accuracy"] = list()
-                ML_scores[f"{model_name} balanced accuracy"].append(
-                    balanced_accuracy_score(target, pred)
-                )
-                # precision score
-                if not f"{model_name} precision" in ML_scores:
-                    ML_scores[f"{model_name} precision"] = list()
-                ML_scores[f"{model_name} precision"].append(precision_score(target, pred))
-                # recall score
-                if not f"{model_name} recall" in ML_scores:
-                    ML_scores[f"{model_name} recall"] = list()
-                ML_scores[f"{model_name} recall"].append(recall_score(target, pred))
-                # f1 score
-                if not f"{model_name} f1" in ML_scores:
-                    ML_scores[f"{model_name} f1"] = list()
-                ML_scores[f"{model_name} f1"].append(f1_score(target, pred))
-                # confusion matrix
-                tn, fp, fn, tp = confusion_matrix(target, pred).ravel()
-                # false positive rate
-                if not f"{model_name} fp" in ML_scores:
-                    ML_scores[f"{model_name} fp"] = list()
-                ML_scores[f"{model_name} fp"].append(fp)
-                # false negative rate
-                if not f"{model_name} fn" in ML_scores:
-                    ML_scores[f"{model_name} fn"] = list()
-                ML_scores[f"{model_name} fn"].append(fn)
-                # true positive rate
-                if not f"{model_name} tp" in ML_scores:
-                    ML_scores[f"{model_name} tp"] = list()
-                ML_scores[f"{model_name} tp"].append(tp)
-                # true negative rate
-                if not f"{model_name} tn" in ML_scores:
-                    ML_scores[f"{model_name} tn"] = list()
-                ML_scores[f"{model_name} tn"].append(tn)
-                # average precision score
-                if not f"{model_name} average precision" in ML_scores:
-                    ML_scores[f"{model_name} average precision"] = list()
-                ML_scores[f"{model_name} average precision"].append(
-                    average_precision_score(target, pred)
-                )
+                scores = get_classification_scores(target=target, pred=pred)
+
+                for metric in scores:
+                    if not f"{model_name} {metric}" in ML_scores["subj_lvl"]:
+                        ML_scores["subj_lvl"][f"{model_name} {metric}"] = list()
+                    ML_scores["subj_lvl"][f"{model_name} {metric}"].append(scores[metric])
+
+            ML_scores["subj_lvl"]["task"].append(task)
+            ML_scores["subj_lvl"]["run"].append(run)
+            ML_scores["subj_lvl"]["dFC method"].append(measure_name)
+            ML_scores["subj_lvl"]["embedding"].append(embedding)
+
+    # sanity check of the ML_scores
+    L = None
+    for key in ML_scores["group_lvl"]:
+        if L is None:
+            L = len(ML_scores["group_lvl"][key])
+        else:
+            assert (
+                len(ML_scores["group_lvl"][key]) == L
+            ), f"Length of {key} is not equal to others."
 
-            ML_scores["task"].append(task)
-            ML_scores["run"].append(run)
-            ML_scores["dFC method"].append(measure_name)
-            ML_scores["embedding"].append(embedding)
+    # L is supposed to be equal to 2 embeddings (PCA and LE) * 2 groups (train and test)
+    assert L == 2 * 2, f"Length of group_lvl is not equal to 4, but {L}."
+
+    L = None
+    for key in ML_scores["subj_lvl"]:
+        if L is None:
+            L = len(ML_scores["subj_lvl"][key])
+        else:
+            assert (
+                len(ML_scores["subj_lvl"][key]) == L
+            ), f"Length of {key} is not equal to others."
+
+    # L is supposed to be equal to number of subjects * 2 embeddings (PCA and LE)
+    assert (
+        L == len(SUBJECTS) * 2
+    ), f"Length of subj_lvl is not equal to {len(SUBJECTS) * 2}, but {L}."
 
-    return ML_RESULT, ML_scores
+    return ML_scores
 
 
 ################################# Clustering Framework Functions ####################################
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 9617db8..6ce5ff4 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -79,12 +79,10 @@ def run_classification(
             print(f"=================== {session} ===================")
 
         ML_scores = {}
-        ML_RESULT = {}
         for task_id, task in enumerate(TASKS):
-            ML_RESULT[task] = {}
             for run in RUNS[task]:
                 try:
-                    ML_RESULT_new, ML_scores_new = task_presence_classification(
+                    ML_scores_new = task_presence_classification(
                         task=task,
                         dFC_id=dFC_id,
                         roi_root=roi_root,
@@ -94,10 +92,6 @@ def run_classification(
                         dynamic_pred=dynamic_pred,
                         normalize_dFC=normalize_dFC,
                     )
-                    if run is None:
-                        ML_RESULT[task] = ML_RESULT_new
-                    else:
-                        ML_RESULT[task][run] = ML_RESULT_new
                     for key in ML_scores_new:
                         if key not in ML_scores:
                             ML_scores[key] = list()
@@ -117,7 +111,6 @@ def run_classification(
                 os.makedirs(folder)
         except OSError as err:
             print(err)
-        np.save(f"{folder}/ML_RESULT_{dFC_id}.npy", ML_RESULT)
 
         np.save(f"{folder}/ML_scores_classify_{dFC_id}.npy", ML_scores)
 

From 2759e265bf897449530802dd16826d8beac0c1c9 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 27 May 2025 17:30:24 -0400
Subject: [PATCH 204/401] minor: accumulate group_lvl and subj_lvl ML scores separately in run_classification

---
 task_dFC/ML.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 6ce5ff4..54f57f5 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -78,7 +78,10 @@ def run_classification(
         if not session is None:
             print(f"=================== {session} ===================")
 
-        ML_scores = {}
+        ML_scores = {
+            "group_lvl": {},
+            "subj_lvl": {},
+        }
         for task_id, task in enumerate(TASKS):
             for run in RUNS[task]:
                 try:
@@ -92,10 +95,19 @@ def run_classification(
                         dynamic_pred=dynamic_pred,
                         normalize_dFC=normalize_dFC,
                     )
-                    for key in ML_scores_new:
-                        if key not in ML_scores:
-                            ML_scores[key] = list()
-                        ML_scores[key].extend(ML_scores_new[key])
+                    # group level scores
+                    for key in ML_scores_new["group_lvl"]:
+                        if key not in ML_scores["group_lvl"]:
+                            ML_scores["group_lvl"][key] = list()
+                        ML_scores["group_lvl"][key].extend(
+                            ML_scores_new["group_lvl"][key]
+                        )
+                    # subject level scores
+                    for key in ML_scores_new["subj_lvl"]:
+                        if key not in ML_scores["subj_lvl"]:
+                            ML_scores["subj_lvl"][key] = list()
+                        ML_scores["subj_lvl"][key].extend(ML_scores_new["subj_lvl"][key])
+
                 except Exception as e:
                     print(
                         f"Error in task presence classification for {session} {task} {run}: {e}"

From 7d3b3e56059ab72c5b8a7ac6e9918b873f1f612b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 27 May 2025 17:38:24 -0400
Subject: [PATCH 205/401] update report: read subj_lvl scores in plot_ML_results

---
 task_dFC/generate_report.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 2cbdbe5..f71bcc0 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -617,6 +617,7 @@ def plot_ML_results(
     ML_scores = None
     for score_file in ALL_ML_SCORES:
         ML_scores_new = np.load(f"{input_dir}/{score_file}", allow_pickle="TRUE").item()
+        ML_scores_new = ML_scores_new["subj_lvl"]
         if ML_scores is None:
             ML_scores = ML_scores_new
         else:

From 6faf5673d478ff0acac4f7440a96e2e2c83f8851 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 28 May 2025 13:29:03 -0400
Subject: [PATCH 206/401] group_permutation

---
 pydfc/ml_utils.py | 54 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 5 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 7fae9ee..19b8ab4 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1029,7 +1029,6 @@ def SVM_classify(X_train, y_train, X_test, y_test):
     C = model_gscv.best_params_["svc__C"]
     gamma = model_gscv.best_params_["svc__gamma"]
 
-    # this is for permutation tests
     model = make_pipeline(
         StandardScaler(),
         SVC(kernel="rbf", C=C, gamma=gamma),
@@ -1063,7 +1062,6 @@ def KNN_classify(X_train, y_train, X_test, y_test):
 
     n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"]
 
-    # this is for permutation tests
     model = make_pipeline(
         StandardScaler(),
         KNeighborsClassifier(n_neighbors=n_neighbors),
@@ -1102,7 +1100,6 @@ def random_forest_classify(X_train, y_train, X_test, y_test):
     n_estimators = rf_gscv.best_params_["randomforestclassifier__n_estimators"]
     max_depth = rf_gscv.best_params_["randomforestclassifier__max_depth"]
 
-    # this is for permutation tests
     model = make_pipeline(
         StandardScaler(),
         RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth),
@@ -1143,7 +1140,6 @@ def gradient_boosting_classify(X_train, y_train, X_test, y_test):
     learning_rate = gb_gscv.best_params_["gradientboostingclassifier__learning_rate"]
     max_depth = gb_gscv.best_params_["gradientboostingclassifier__max_depth"]
 
-    # this is for permutation tests
     model = make_pipeline(
         StandardScaler(),
         GradientBoostingClassifier(
@@ -1162,12 +1158,55 @@ def gradient_boosting_classify(X_train, y_train, X_test, y_test):
     return RESULT
 
 
+def group_permutation(y, groups, permute_groups=True):
+    """
+    Permute the labels while keeping the group structure intact.
+    This is useful for permutation tests where we want to keep the group structure.
+    Also permute the order of groups if permute_groups is True.
+    If permute_groups is False, the labels within each group are permuted but the order of groups is not changed.
+    This function assumes that all samples in a group have the same label.
+    """
+    # make sure groups is a numpy array
+    groups = np.array(groups, copy=True)
+    y = np.copy(y)
+
+    unique_groups = np.unique(groups)
+
+    # Step 1: Create a mapping from groups to labels
+    group_to_label = {group: y[groups == group] for group in unique_groups}
+
+    # Step 2: Permute each group labels
+    group_to_permuted_label = {}
+    for group in unique_groups:
+        group_to_permuted_label[group] = np.random.permutation(group_to_label[group])
+
+    # Step 3: Reconstruct permuted y based on groups
+    # also shuffle the order of groups if permute_groups is True
+    if permute_groups:
+        unique_groups_permuted = np.random.permutation(unique_groups)
+    else:
+        unique_groups_permuted = unique_groups
+    y_permuted = list()
+    for group in unique_groups_permuted:
+        # For each group, append the permuted label to y_permuted
+        y_permuted.extend(group_to_permuted_label[group])
+    # Convert to numpy array
+    y_permuted = np.array(y_permuted)
+
+    assert (
+        y_permuted.shape == y.shape
+    ), f"Permuted labels shape {y_permuted.shape} does not match original labels shape {y.shape}"
+
+    return y_permuted
+
+
 def get_permutation_scores(
     X_train,
     y_train,
     X_test,
     y_test,
     classifier_model,
+    groups_train=None,
     n_permutations=100,
 ):
     """
@@ -1184,7 +1223,11 @@ def get_permutation_scores(
     permutation_train_scores = []
     permutation_test_scores = []
     for _ in range(n_permutations):
-        y_train_permuted = np.random.permutation(y_train)
+        if groups_train is not None:
+            # permute the labels while keeping the group structure intact
+            y_train_permuted = group_permutation(y_train, groups_train)
+        else:
+            y_train_permuted = np.random.permutation(y_train)
         model_permuted = clone(classifier_model)
         model_permuted.fit(X_train, y_train_permuted)
 
@@ -1373,6 +1416,7 @@ def task_presence_classification(
                 X_test=X_test_embedded,
                 y_test=y_test,
                 classifier_model=ML_models[model_name]["model"],
+                groups_train=subj_label_train,
                 n_permutations=100,
             )
             permutation_scores["train"][

From cbed638f22c1eca8d4b7e39d508ed5ebe5e85b69 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 28 May 2025 13:43:24 -0400
Subject: [PATCH 207/401] change cv of logreg and svm to StratifiedGroupKFold

---
 pydfc/ml_utils.py | 65 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 56 insertions(+), 9 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 19b8ab4..f087703 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -28,11 +28,12 @@
     recall_score,
     silhouette_score,
 )
-from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold, StratifiedKFold
 from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors, kneighbors_graph
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
+from sklearn.utils import shuffle
 
 from .dfc_utils import dFC_mat2vec, dFC_vec2mat, rank_norm
 from .task_utils import (
@@ -976,9 +977,12 @@ def get_classification_results(
     return RESULT
 
 
-def logistic_regression_classify(X_train, y_train, X_test, y_test):
+def logistic_regression_classify(X_train, y_train, X_test, y_test, subj_label_train=None):
     """
     Logistic regression classification
+
+    provide subj_label_train if you want to use StratifiedGroupKFold
+    to ensure that the same subject is not in both train and test sets
     """
     # create a pipeline with a logistic regression model to find the best C
     logistic_reg = make_pipeline(
@@ -986,10 +990,25 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test):
     )
     # create a dictionary of all values we want to test for C
     param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
+
+    # use StratifiedGroupKFold to ensure that the same subject is not in both train and test sets
+    # shuffle the data to ensure time points are shuffled
+    if subj_label_train is None:
+        X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train)
+        cv = StratifiedKFold(n_splits=5)
+    else:
+        X_train_shuffled, y_train_shuffled, subj_label_train_shuffled = shuffle(
+            X_train, y_train, subj_label_train
+        )
+        cv = StratifiedGroupKFold(n_splits=5)
     # use gridsearch to test all values for C
-    lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=5)
+    lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=cv, n_jobs=-1)
     # fit model to data
-    lr_gscv.fit(X_train, y_train)
+    if subj_label_train is None:
+        lr_gscv.fit(X_train_shuffled, y_train_shuffled)
+    else:
+        # use groups to ensure that the same subject is not in both train and test sets
+        lr_gscv.fit(X_train_shuffled, y_train_shuffled, groups=subj_label_train_shuffled)
 
     C = lr_gscv.best_params_["logisticregression__C"]
 
@@ -1009,9 +1028,12 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test):
     return RESULT
 
 
-def SVM_classify(X_train, y_train, X_test, y_test):
+def SVM_classify(X_train, y_train, X_test, y_test, subj_label_train=None):
     """
     SVM classification
+
+    provide subj_label_train if you want to use StratifiedGroupKFold
+    to ensure that the same subject is not in both train and test sets
     """
     # define the parameter grid
     param_grid = {
@@ -1024,8 +1046,23 @@ def SVM_classify(X_train, y_train, X_test, y_test):
         StandardScaler(),
         SVC(kernel="rbf"),
     )
-    model_gscv = GridSearchCV(model_for_hyperparam, param_grid, cv=3, n_jobs=-1)
-    model_gscv.fit(X_train, y_train)
+    # use StratifiedGroupKFold to ensure that the same subject is not in both train and test sets
+    # shuffle the data to ensure time points are shuffled
+    if subj_label_train is None:
+        X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train)
+        cv = StratifiedKFold(n_splits=3)
+    else:
+        X_train_shuffled, y_train_shuffled, subj_label_train_shuffled = shuffle(
+            X_train, y_train, subj_label_train
+        )
+        cv = StratifiedGroupKFold(n_splits=3)
+    model_gscv = GridSearchCV(model_for_hyperparam, param_grid, cv=cv, n_jobs=-1)
+    if subj_label_train is None:
+        model_gscv.fit(X_train_shuffled, y_train_shuffled)
+    else:
+        model_gscv.fit(
+            X_train_shuffled, y_train_shuffled, groups=subj_label_train_shuffled
+        )
     C = model_gscv.best_params_["svc__C"]
     gamma = model_gscv.best_params_["svc__gamma"]
 
@@ -1391,11 +1428,21 @@ def task_presence_classification(
 
         # logistic regression
         log_reg_RESULT = logistic_regression_classify(
-            X_train_embedded, y_train, X_test_embedded, y_test
+            X_train=X_train_embedded,
+            y_train=y_train,
+            X_test=X_test_embedded,
+            y_test=y_test,
+            subj_label_train=subj_label_train,
         )
 
         # SVM
-        SVM_RESULT = SVM_classify(X_train_embedded, y_train, X_test_embedded, y_test)
+        SVM_RESULT = SVM_classify(
+            X_train=X_train_embedded,
+            y_train=y_train,
+            X_test=X_test_embedded,
+            y_test=y_test,
+            subj_label_train=subj_label_train,
+        )
 
         ML_models = {"Logistic regression": log_reg_RESULT, "SVM": SVM_RESULT}
 

From 67535fccd63a3ae4124aa50659ce7413ca1ebb66 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 31 May 2025 17:49:31 -0400
Subject: [PATCH 208/401] minor

---
 task_dFC/run_scripts_slurm/run_ML.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_ML.sh b/task_dFC/run_scripts_slurm/run_ML.sh
index c0363c9..7187c62 100644
--- a/task_dFC/run_scripts_slurm/run_ML.sh
+++ b/task_dFC/run_scripts_slurm/run_ML.sh
@@ -3,7 +3,6 @@
 #SBATCH --job-name=ML_job   # Optional: Name of your job
 #SBATCH --output=logs/ML_out.txt  # Standard output log
 #SBATCH --error=logs/ML_err.txt   # Standard error log
-#SBATCH --time=4-00:00:00                # Walltime for each task (4 days)
 #SBATCH --mem=128G                     # Memory request per node
 
 DATASET_INFO="./dataset_info.json"

From ba93c1d76e289382c647753b0ec240760e9034b2 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 2 Jun 2025 11:52:54 -0400
Subject: [PATCH 209/401] fix bug in ml_utils

---
 pydfc/ml_utils.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index f087703..cad9abf 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1503,11 +1503,11 @@ def task_presence_classification(
                         ML_models[model_name][group][metric]
                     )
 
-                # permutation test results
-                for key in permutation_scores[group]:
-                    if not key in ML_scores["group_lvl"]:
-                        ML_scores["group_lvl"][key] = list()
-                    ML_scores["group_lvl"][key].append(permutation_scores[group][key])
+            # permutation test results
+            for key in permutation_scores[group]:
+                if not key in ML_scores["group_lvl"]:
+                    ML_scores["group_lvl"][key] = list()
+                ML_scores["group_lvl"][key].append(permutation_scores[group][key])
 
         # subject level scores
         for subj in SUBJECTS:

From 297ee3016ee45334848994c6c4f51b70286aba38 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 13 Jun 2025 01:25:02 -0400
Subject: [PATCH 210/401] refactor simul_utils

---
 pydfc/simul_utils.py             | 209 ++++++++++++++++++++-----------
 simul_dFC/task_data_simulator.py |   2 +-
 2 files changed, 137 insertions(+), 74 deletions(-)

diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py
index 79a9cd6..7fd197b 100644
--- a/pydfc/simul_utils.py
+++ b/pydfc/simul_utils.py
@@ -7,6 +7,7 @@
 """
 
 import numpy as np
+from scipy import signal
 from tvb.simulator.lab import *
 
 from pydfc import TIME_SERIES, task_utils
@@ -14,6 +15,56 @@
 ################################# Simulation Functions ####################################
 
 
+class CustomStimuli(patterns.StimuliRegion):
+    def __init__(
+        self, stimulus_timing, region_weighting, connectivity, amplitude=1.0, **kwargs
+    ):
+        """
+        Parameters:
+        - stimulus_timing: array of 0s and 1s (or amplitudes) over time
+        - region_weighting: array of per-node weights; the stimulus value is multiplied by it
+        - amplitude: default amplitude (multiplied by stimulus_timing value)
+        """
+        super().__init__(**kwargs)
+        self.stimulus_timing = np.array(stimulus_timing)
+        self.amplitude = amplitude
+        self.current_idx = 0
+        self.weight = region_weighting
+        self.connectivity = connectivity  # Required by TVB, even if not used
+        # Required by TVB, even if not used
+        self.temporal = equations.PulseTrain()
+        self.spatial = equations.DiscreteEquation()
+
+    def __call__(self, temporal_indices, spatial_indices=None):
+
+        # if temporal_indices is not a single integer, raise an error
+        if not isinstance(temporal_indices, (int, np.integer)):
+            raise ValueError(
+                "CustomStimuli expects a single integer for temporal_indices."
+            )
+        # time is milliseconds
+        n_nodes = self.weight.shape[0]
+        stim = np.zeros(n_nodes)
+
+        # Determine which index in the stimulus array corresponds to current time
+        self.current_idx = temporal_indices
+
+        if self.current_idx < len(self.stimulus_timing):
+            stim_value = self.stimulus_timing[self.current_idx] * self.amplitude
+        else:
+            stim_value = 0  # stimulus ends when array is exhausted
+
+        stim = np.multiply(self.weight, stim_value)
+        self.stimulus = stim
+        return self.stimulus
+
+    def set_state(self, state):
+        self.state = state
+
+    def configure_time(self, t):
+        pass
+
+
 def create_random_stimulus_weights(stimulated_regions_list, n_regions=76):
     """
     Create random stimulus weights for the stimulated regions.
@@ -30,33 +81,8 @@ def create_random_stimulus_weights(stimulated_regions_list, n_regions=76):
     return weighting
 
 
-def create_stimulus(
-    onset,
-    task_duration,
-    task_block_duration,
-    conn,
-    region_weighting,
-):
-    """
-    Create a stimulus pattern for the task.
-    """
-    # temporal profile
-    eqn_t = equations.PulseTrain()
-    eqn_t.parameters["onset"] = onset * 1e3  # ms
-    eqn_t.parameters["tau"] = task_duration * 1e3  # ms
-    eqn_t.parameters["T"] = task_block_duration * 1e3  # ms
-
-    stimulus = patterns.StimuliRegion(
-        temporal=eqn_t, connectivity=conn, weight=region_weighting
-    )
-
-    return stimulus
-
-
 def simulate_task_BOLD(
-    onset_time,
-    task_duration,
-    task_block_duration,
+    stimulus_timing,
     sim_length,
     BOLD_period,
     TAVG_period,
@@ -72,18 +98,14 @@ def simulate_task_BOLD(
 
     Parameters
     ----------
-    onset_time : float
-        The onset time of the task.
-    task_duration : float
-        The duration of the task.
-    task_block_duration : float
-        The duration of the task block.
+    stimulus_timing : array-like
+        The stimulus timing array over time; must contain only 0s and 1s (a ValueError is raised otherwise).
     sim_length : float
-        The length of the simulation.
+        The length of the simulation in seconds.
     BOLD_period : float
-        The BOLD period.
+        The BOLD period in seconds.
     TAVG_period : float
-        The TAVG period.
+        The TAVG period in seconds.
     num_stimulated_regions : int, optional
         The number of stimulated regions. The default is 5.
         if num_stimulated_regions is 5, the stimulated regions are:
@@ -95,7 +117,6 @@ def simulate_task_BOLD(
         else, the stimulated regions are randomly selected.
     """
     # randomize some parameters for each subjects
-    onset = np.random.normal(loc=onset_time, scale=0.5)  # seconds
     global_conn_coupling = np.random.normal(loc=global_conn_coupling_coef, scale=0.0075)
     conn_speed_rand = np.random.normal(loc=conn_speed, scale=0.1 * conn_speed)
     ################################# Initialize Simulation ####################################
@@ -118,12 +139,15 @@ def simulate_task_BOLD(
         stimulated_regions_list=stimulated_regions_list, n_regions=76
     )
 
-    stimulus = create_stimulus(
-        onset=onset,
-        task_duration=task_duration,
-        task_block_duration=task_block_duration,
-        conn=conn,
+    # check if stimulus_timing is only containing 0s and 1s
+    if not np.all(np.isin(stimulus_timing, [0, 1])):
+        raise ValueError("stimulus_timing should only contain 0s and 1s.")
+
+    stimulus = CustomStimuli(
+        stimulus_timing=stimulus_timing,
         region_weighting=weighting,
+        connectivity=conn,
+        amplitude=1.0,
     )
 
     ################################# Run Simulation ####################################
@@ -185,7 +209,6 @@ def simulate_task_BOLD(
 
 
 def create_simul_task_info(
-    num_time_mri,
     TR_mri,
     task,
     onset,
@@ -199,10 +222,8 @@ def create_simul_task_info(
 
     Parameters
     ----------
-    num_time_mri : int
-        Number of time points in the BOLD signal.
     TR_mri : float
-        The repetition time of the MRI.
+        The repetition time of the MRI in seconds.
     task : str
         The task name.
     onset : float
@@ -213,6 +234,7 @@ def create_simul_task_info(
         The duration of the task block.
     sim_length : float
         The length of the simulation.
+        in milliseconds
     oversampling : int, optional
         The oversampling factor. The default is 50.
         generate more samples per TR than the func data to have a
@@ -224,11 +246,15 @@ def create_simul_task_info(
     # using onset, task_duration, task_block_duration to create the events
     events.append(["onset", "duration", "trial_type"])
     t = onset
-    while t < sim_length:
+    while t < (sim_length * 1e-3):
         events.append([t, task_duration, "task"])
         t += task_block_duration
     events = np.array(events)
 
+    # find the number of time points in the MRI data
+    # sim_length is in milliseconds
+    num_time_mri = int((sim_length * 1e-3) / TR_mri)
+
     event_labels, Fs_task, event_types = task_utils.events_time_to_labels(
         events=events,
         TR_mri=TR_mri,
@@ -254,14 +280,39 @@ def create_simul_task_info(
     return task_data
 
 
+def event_labels_to_stimulus_timing(
+    event_labels,
+    Fs_task,
+    dt,
+):
+    """
+    Convert event labels to stimulus timing.
+    Parameters
+    ----------
+    event_labels : array-like
+        The event labels, which should contain 0s (rest) and event ids over time.
+    Fs_task : float
+        The sampling frequency of the task data in Hz.
+    dt : float
+        The simulation time step in milliseconds.
+    """
+    # make sure the timings are only 0s and 1s
+    stimulus_timing = np.multiply(event_labels != 0, 1)
+
+    # resample the task labels so that each sample spans one simulation time step (dt, in ms)
+    L_old = len(stimulus_timing)
+    L_new = int((L_old * 1e3) / (Fs_task * dt))
+    stimulus_timing = signal.resample(stimulus_timing, L_new)
+    # binarize the stimulus timing
+    # because of the resampling, the values might not be exactly 0 or 1
+    stimulus_timing = np.where(stimulus_timing > 0.5, 1, 0)
+
+    return stimulus_timing
+
+
 def simulate_task_BOLD_TS(
     subj_id,
-    task,
-    onset_time,
-    task_duration,
-    task_block_duration,
-    sim_length,
-    BOLD_period,
+    task_data,
     TAVG_period,
     num_stimulated_regions=5,
     global_conn_coupling_coef=0.0126,
@@ -273,11 +324,18 @@ def simulate_task_BOLD_TS(
     """
     Simulate BOLD signal for a task and return a TIME_SERIES object.
     """
+    task = task_data["task"]
+    BOLD_period = task_data["TR_mri"] * 1e3  # convert to milliseconds
+    sim_length = task_data["num_time_mri"] * BOLD_period  # in milliseconds
+    stimulus_timing = event_labels_to_stimulus_timing(
+        event_labels=task_data["event_labels"],
+        Fs_task=task_data["Fs_task"],
+        dt=dt,
+    )
+
     bold_data, bold_time, region_labels, centres_locs, TR_mri, _, _, _ = (
         simulate_task_BOLD(
-            onset_time=onset_time,
-            task_duration=task_duration,
-            task_block_duration=task_block_duration,
+            stimulus_timing=stimulus_timing,
             sim_length=sim_length,
             BOLD_period=BOLD_period,
             TAVG_period=TAVG_period,
@@ -298,18 +356,8 @@ def simulate_task_BOLD_TS(
         TS_name=f"BOLD_{subj_id}_{task}",
         session_name=task,
     )
-    num_time_mri = time_series.n_time
-    task_data = create_simul_task_info(
-        num_time_mri=num_time_mri,
-        TR_mri=TR_mri,
-        task=task,
-        onset=onset_time,
-        task_duration=task_duration,
-        task_block_duration=task_block_duration,
-        sim_length=sim_length,
-    )
 
-    return time_series, task_data
+    return time_series
 
 
 def simulate_task_data(subj_id, task_info):
@@ -324,6 +372,10 @@ def simulate_task_data(subj_id, task_info):
         A dictionary containing the task information below:
             - task_name: str
                 The name of the task.
+            - task_data: dict
+                A dictionary containing the task parameters
+                if task_data is not provided, onset_time, task_duration, task_block_duration,
+                sim_length, will be used to create the task data.
             - onset_time: float
                 The onset time of the task.
             - task_duration: float
@@ -347,14 +399,21 @@ def simulate_task_data(subj_id, task_info):
             - dt: float
                 The simulation time step.
     """
-    time_series, task_data = simulate_task_BOLD_TS(
+    if task_info["task_data"] is not None:
+        task_data = task_info["task_data"]
+    else:
+        task_data = create_simul_task_info(
+            TR_mri=task_info["BOLD_period"] * 1e-3,  # convert to seconds
+            task=task_info["task_name"],
+            onset=task_info["onset_time"],
+            task_duration=task_info["task_duration"],
+            task_block_duration=task_info["task_block_duration"],
+            sim_length=task_info["sim_length"],
+        )
+
+    time_series = simulate_task_BOLD_TS(
         subj_id=subj_id,
-        task=task_info["task_name"],
-        onset_time=task_info["onset_time"],
-        task_duration=task_info["task_duration"],
-        task_block_duration=task_info["task_block_duration"],
-        sim_length=task_info["sim_length"],
-        BOLD_period=task_info["BOLD_period"],
+        task_data=task_data,
         TAVG_period=task_info["TAVG_period"],
         num_stimulated_regions=task_info["num_stimulated_regions"],
         global_conn_coupling_coef=task_info["global_conn_coupling_coef"],
@@ -363,4 +422,8 @@ def simulate_task_data(subj_id, task_info):
         dt=task_info["dt"],
     )
 
+    # make sure task_data["num_time_mri"] is equal to the number of time points in the time series
+    if task_data["num_time_mri"] != time_series.n_time:
+        task_data["num_time_mri"] = time_series.n_time
+
     return time_series, task_data
diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index 15d43d7..f3312a2 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -29,7 +29,7 @@
 global_conn_coupling_coef = 0.0126
 conn_speed = 1.0
 D = 0.001  # noise dispersion
-dt = 0.5  # integration step
+dt = 0.5  # integration step in m sec
 n_subj = 200  # number of subjects
 
 # argparse

From 2f9a5a7b26dd570d01b3d044cc7b8100431fe4b2 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 13 Jun 2025 12:38:02 -0400
Subject: [PATCH 211/401] task_data_simulator reads tasks_info from json

---
 pydfc/simul_utils.py                          |  31 ++--
 simul_dFC/run_scripts_slurm/run_simulator.sh  |   9 +-
 .../tasks_info_ds003465.json                  |  42 +++++
 .../tasks_info_pulseTrain.json                | 100 +++++++++++
 simul_dFC/task_data_simulator.py              | 161 ++++--------------
 5 files changed, 198 insertions(+), 145 deletions(-)
 create mode 100644 simul_dFC/run_scripts_slurm/tasks_info_ds003465.json
 create mode 100644 simul_dFC/run_scripts_slurm/tasks_info_pulseTrain.json

diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py
index 7fd197b..0a00a48 100644
--- a/pydfc/simul_utils.py
+++ b/pydfc/simul_utils.py
@@ -103,9 +103,9 @@ def simulate_task_BOLD(
     sim_length : float
         The length of the simulation in seconds.
     BOLD_period : float
-        The BOLD period in seconds.
+        The BOLD period in milliseconds.
     TAVG_period : float
-        The TAVG period in seconds.
+        The TAVG period in milliseconds.
     num_stimulated_regions : int, optional
         The number of stimulated regions. The default is 5.
         if num_stimulated_regions is 5, the stimulated regions are:
@@ -366,28 +366,28 @@ def simulate_task_data(subj_id, task_info):
 
     Parameters
     ----------
-    subj_id : int
+    subj_id : str
         The subject ID.
     task_info : dict
         A dictionary containing the task information below:
             - task_name: str
                 The name of the task.
-            - task_data: dict
-                A dictionary containing the task parameters
+            - task_data: str
+                Path to a .npy file storing the task data dictionary; may contain a {subj_id} placeholder
                 if task_data is not provided, onset_time, task_duration, task_block_duration,
                 sim_length, will be used to create the task data.
             - onset_time: float
-                The onset time of the task.
+                The onset time of the task in seconds.
             - task_duration: float
-                The duration of the task.
+                The duration of the task in seconds.
             - task_block_duration: float
-                The duration of the task block.
+                The duration of the task block in seconds.
             - sim_length: float
-                The length of the simulation.
+                The length of the simulation in milliseconds.
             - BOLD_period: float
-                The BOLD period.
+                The BOLD period in milliseconds.
             - TAVG_period: float
-                The TAVG period.
+                The TAVG period in milliseconds.
             - num_stimulated_regions: int
                 The number of stimulated regions.
             - global_conn_coupling_coef: float
@@ -397,10 +397,15 @@ def simulate_task_data(subj_id, task_info):
             - conn_speed: float
                 The connectivity speed.
             - dt: float
-                The simulation time step.
+                The simulation time step in milliseconds.
     """
     if task_info["task_data"] is not None:
-        task_data = task_info["task_data"]
+        # task_info["task_data"] is a path to a dictionary with {subj_id} as a placeholder
+        if "{subj_id}" in task_info["task_data"]:
+            task_data_path = task_info["task_data"].replace("{subj_id}", subj_id)
+        else:
+            task_data_path = task_info["task_data"]
+        task_data = np.load(task_data_path, allow_pickle="TRUE").item()
     else:
         task_data = create_simul_task_info(
             TR_mri=task_info["BOLD_period"] * 1e-3,  # convert to seconds
diff --git a/simul_dFC/run_scripts_slurm/run_simulator.sh b/simul_dFC/run_scripts_slurm/run_simulator.sh
index 21e669e..ccb4065 100644
--- a/simul_dFC/run_scripts_slurm/run_simulator.sh
+++ b/simul_dFC/run_scripts_slurm/run_simulator.sh
@@ -8,14 +8,21 @@
 #SBATCH --mem=8G                     # Memory request per node
 #SBATCH --array=1-200                # Task array specification
 
+SUBJECT_LIST="./subj_list.txt"
 DATASET_INFO="./dataset_info.json"
+TASKS_INFO="./tasks_info.json"
+
+SUBJECT_ID=`sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST`
+echo "Subject ID: $SUBJECT_ID"
 
 # Activate  virtual environment
 source "/home/mt00/venvs/pydfc/bin/activate"
 
 # Run Python script
 python "/home/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py" \
---dataset_info $DATASET_INFO
+--dataset_info $DATASET_INFO \
+--tasks_info $TASKS_INFO \
+--participant_id $SUBJECT_ID
 
 # Deactivate environment
 deactivate
diff --git a/simul_dFC/run_scripts_slurm/tasks_info_ds003465.json b/simul_dFC/run_scripts_slurm/tasks_info_ds003465.json
new file mode 100644
index 0000000..009e173
--- /dev/null
+++ b/simul_dFC/run_scripts_slurm/tasks_info_ds003465.json
@@ -0,0 +1,42 @@
+{
+    "task-Axcpt": {
+        "task_name": "task-Axcpt",
+        "task_data": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/ds003465/derivatives/ROI_timeseries/{subj_id}/ses-wave1bas/{subj_id}_ses-wave1bas_task-Axcpt_run-1_task-data.npy",
+        "TAVG_period": 1.0,
+        "num_stimulated_regions": 5,
+        "global_conn_coupling_coef": 0.0126,
+        "D": 0.01,
+        "conn_speed": 1.0,
+        "dt": 0.5
+    },
+    "task-Cuedts": {
+        "task_name": "task-Cuedts",
+        "task_data": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/ds003465/derivatives/ROI_timeseries/{subj_id}/ses-wave1bas/{subj_id}_ses-wave1bas_task-Cuedts_run-1_task-data.npy",
+        "TAVG_period": 1.0,
+        "num_stimulated_regions": 5,
+        "global_conn_coupling_coef": 0.0126,
+        "D": 0.01,
+        "conn_speed": 1.0,
+        "dt": 0.5
+    },
+    "task-Stern": {
+        "task_name": "task-Stern",
+        "task_data": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/ds003465/derivatives/ROI_timeseries/{subj_id}/ses-wave1bas/{subj_id}_ses-wave1bas_task-Stern_run-1_task-data.npy",
+        "TAVG_period": 1.0,
+        "num_stimulated_regions": 5,
+        "global_conn_coupling_coef": 0.0126,
+        "D": 0.01,
+        "conn_speed": 1.0,
+        "dt": 0.5
+    },
+    "task-Stroop": {
+        "task_name": "task-Stroop",
+        "task_data": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/ds003465/derivatives/ROI_timeseries/{subj_id}/ses-wave1bas/{subj_id}_ses-wave1bas_task-Stroop_run-1_task-data.npy",
+        "TAVG_period": 1.0,
+        "num_stimulated_regions": 5,
+        "global_conn_coupling_coef": 0.0126,
+        "D": 0.01,
+        "conn_speed": 1.0,
+        "dt": 0.5
+    }
+}
diff --git a/simul_dFC/run_scripts_slurm/tasks_info_pulseTrain.json b/simul_dFC/run_scripts_slurm/tasks_info_pulseTrain.json
new file mode 100644
index 0000000..41967c2
--- /dev/null
+++ b/simul_dFC/run_scripts_slurm/tasks_info_pulseTrain.json
@@ -0,0 +1,100 @@
+{
+    "task-lowFreqLongRest": {
+        "task_name": "task-lowFreqLongRest",
+        "onset_time": 20.0,
+        "task_duration": 8.0,
+        "task_block_duration": 20.0,
+        "sim_length": 250e3,
+        "BOLD_period": 500,
+        "TAVG_period": 1.0,
+        "num_stimulated_regions": 5,
+        "global_conn_coupling_coef": 0.0126,
+        "D": 0.01,
+        "conn_speed": 1.0,
+        "dt": 0.5
+    },
+    "task-lowFreqShortRest": {
+        "task_name": "task-lowFreqShortRest",
+        "onset_time": 20.0,
+        "task_duration": 12.0,
+        "task_block_duration": 20.0,
+        "sim_length": 250e3,
+        "BOLD_period": 500,
+        "TAVG_period": 1.0,
+        "num_stimulated_regions": 5,
+        "global_conn_coupling_coef": 0.0126,
+        "D": 0.01,
+        "conn_speed": 1.0,
+        "dt": 0.5
+    },
+    "task-lowFreqShortTask": {
+        "task_name": "task-lowFreqShortTask",
+        "onset_time": 20.0,
+        "task_duration": 1.0,
+        "task_block_duration": 20.0,
+        "sim_length": 250e3,
+        "BOLD_period": 500,
+        "TAVG_period": 1.0,
+        "num_stimulated_regions": 5,
+        "global_conn_coupling_coef": 0.0126,
+        "D": 0.01,
+        "conn_speed": 1.0,
+        "dt": 0.5
+    },
+    "task-highFreqLongRest": {
+        "task_name": "task-highFreqLongRest",
+        "onset_time": 20.0,
+        "task_duration": 1.0,
+        "task_block_duration": 5.0,
+        "sim_length": 250e3,
+        "BOLD_period": 500,
+        "TAVG_period": 1.0,
+        "num_stimulated_regions": 5,
+        "global_conn_coupling_coef": 0.0126,
+        "D": 0.01,
+        "conn_speed": 1.0,
+        "dt": 0.5
+    },
+    "task-highFreqShortRest": {
+        "task_name": "task-highFreqShortRest",
+        "onset_time": 20.0,
+        "task_duration": 4.0,
+        "task_block_duration": 5.0,
+        "sim_length": 250e3,
+        "BOLD_period": 500,
+        "TAVG_period": 1.0,
+        "num_stimulated_regions": 5,
+        "global_conn_coupling_coef": 0.0126,
+        "D": 0.01,
+        "conn_speed": 1.0,
+        "dt": 0.5
+    },
+    "task-lowFreqShortRestDominStimul": {
+        "task_name": "task-lowFreqShortRestDominStimul",
+        "onset_time": 20.0,
+        "task_duration": 12.0,
+        "task_block_duration": 20.0,
+        "sim_length": 250e3,
+        "BOLD_period": 500,
+        "TAVG_period": 1.0,
+        "num_stimulated_regions": 26,
+        "global_conn_coupling_coef": 0.0126,
+        "D": 0.01,
+        "conn_speed": 1.0,
+        "dt": 0.5
+    },
+    "task-lowFreqShortRestNoisy": {
+        "task_name": "task-midFreqMidRestNoisy",
+        "onset_time": 20.0,
+        "task_duration": 12.0,
+        "task_block_duration": 20.0,
+        "sim_length": 250e3,
+        "BOLD_period": 500,
+        "TAVG_period": 1.0,
+        "num_stimulated_regions": 5,
+        "global_conn_coupling_coef": 0.0126,
+        "D": 1.00,
+        "conn_speed": 1.0,
+        "dt": 0.5
+    }
+}
diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index f3312a2..17059f9 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -21,17 +21,6 @@
 os.environ["OMP_NUM_THREADS"] = "16"
 ################################# Parameters ####################################
 
-# simulation parameters
-sim_length = 250e3  # in m sec
-onset_time = 20.0  # in seconds
-BOLD_period = 500  # in m sec
-TAVG_period = 1.0  # in m sec
-global_conn_coupling_coef = 0.0126
-conn_speed = 1.0
-D = 0.001  # noise dispersion
-dt = 0.5  # integration step in m sec
-n_subj = 200  # number of subjects
-
 # argparse
 HELPTEXT = """
 Script to simulate task-based data.
@@ -39,10 +28,14 @@
 parser = argparse.ArgumentParser(description=HELPTEXT)
 
 parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
+parser.add_argument("--tasks_info", type=str, help="path to tasks info file")
+parser.add_argument("--participant_id", type=str, help="participant id")
 
 args = parser.parse_args()
 
 dataset_info_file = args.dataset_info
+tasks_info_file = args.tasks_info
+participant_id = args.participant_id
 
 # Read dataset info
 with open(dataset_info_file, "r") as f:
@@ -58,128 +51,34 @@
 else:
     output_root = dataset_info["roi_root"]
 
-# create a subject id list
-subj_list = [f"sub-{i:04d}" for i in range(1, n_subj + 1)]
-
-job_id = os.getenv("SGE_TASK_ID")  # for SGE
-if job_id is None:
-    job_id = os.getenv("SLURM_ARRAY_TASK_ID")  # for SLURM
-job_id = int(job_id)
-subj_id = subj_list[job_id - 1]  # TASK_ID starts from 1 not 0
-
-print(f"subject-level simulation started running ... for subject: {subj_id} ...")
-
-all_task_info = {
-    "task-lowFreqLongRest": {
-        "task_name": "task-lowFreqLongRest",
-        "onset_time": onset_time,
-        "task_duration": 8.0,
-        "task_block_duration": 20.0,
-        "sim_length": sim_length,
-        "BOLD_period": BOLD_period,
-        "TAVG_period": TAVG_period,
-        "num_stimulated_regions": 5,
-        "global_conn_coupling_coef": global_conn_coupling_coef,
-        "D": D,
-        "conn_speed": conn_speed,
-        "dt": dt,
-    },
-    "task-lowFreqShortRest": {
-        "task_name": "task-lowFreqShortRest",
-        "onset_time": onset_time,
-        "task_duration": 12.0,
-        "task_block_duration": 20.0,
-        "sim_length": sim_length,
-        "BOLD_period": BOLD_period,
-        "TAVG_period": TAVG_period,
-        "num_stimulated_regions": 5,
-        "global_conn_coupling_coef": global_conn_coupling_coef,
-        "D": D,
-        "conn_speed": conn_speed,
-        "dt": dt,
-    },
-    "task-lowFreqShortTask": {
-        "task_name": "task-lowFreqShortTask",
-        "onset_time": onset_time,
-        "task_duration": 1.0,
-        "task_block_duration": 20.0,
-        "sim_length": sim_length,
-        "BOLD_period": BOLD_period,
-        "TAVG_period": TAVG_period,
-        "num_stimulated_regions": 5,
-        "global_conn_coupling_coef": global_conn_coupling_coef,
-        "D": D,
-        "conn_speed": conn_speed,
-        "dt": dt,
-    },
-    "task-highFreqLongRest": {
-        "task_name": "task-highFreqLongRest",
-        "onset_time": onset_time,
-        "task_duration": 1.0,
-        "task_block_duration": 5.0,
-        "sim_length": sim_length,
-        "BOLD_period": BOLD_period,
-        "TAVG_period": TAVG_period,
-        "num_stimulated_regions": 5,
-        "global_conn_coupling_coef": global_conn_coupling_coef,
-        "D": D,
-        "conn_speed": conn_speed,
-        "dt": dt,
-    },
-    "task-highFreqShortRest": {
-        "task_name": "task-highFreqShortRest",
-        "onset_time": onset_time,
-        "task_duration": 4.0,
-        "task_block_duration": 5.0,
-        "sim_length": sim_length,
-        "BOLD_period": BOLD_period,
-        "TAVG_period": TAVG_period,
-        "num_stimulated_regions": 5,
-        "global_conn_coupling_coef": global_conn_coupling_coef,
-        "D": D,
-        "conn_speed": conn_speed,
-        "dt": dt,
-    },
-    "task-lowFreqShortRestDominStimul": {
-        "task_name": "task-lowFreqShortRestDominStimul",
-        "onset_time": onset_time,
-        "task_duration": 12.0,
-        "task_block_duration": 20.0,
-        "sim_length": sim_length,
-        "BOLD_period": BOLD_period,
-        "TAVG_period": TAVG_period,
-        "num_stimulated_regions": 26,
-        "global_conn_coupling_coef": global_conn_coupling_coef,
-        "D": D,
-        "conn_speed": conn_speed,
-        "dt": dt,
-    },
-    "task-lowFreqShortRestNoisy": {
-        "task_name": "task-midFreqMidRestNoisy",
-        "onset_time": onset_time,
-        "task_duration": 12.0,
-        "task_block_duration": 20.0,
-        "sim_length": sim_length,
-        "BOLD_period": BOLD_period,
-        "TAVG_period": TAVG_period,
-        "num_stimulated_regions": 5,
-        "global_conn_coupling_coef": global_conn_coupling_coef,
-        "D": D * 100,
-        "conn_speed": conn_speed,
-        "dt": dt,
-    },
-}
-
-for task in all_task_info:
-
-    time_series, task_data = simul_utils.simulate_task_data(subj_id, all_task_info[task])
+# Read tasks info
+with open(tasks_info_file, "r") as f:
+    all_tasks_info = json.load(f)
+
+print(f"subject-level simulation started running ... for subject: {participant_id} ...")
+
+for task in all_tasks_info:
+
+    # the task_data file might not exist for some subjects, so we use a try-except block
+    try:
+        time_series, task_data = simul_utils.simulate_task_data(
+            participant_id, all_tasks_info[task]
+        )
+    except Exception as e:
+        print(f"Error simulating task {task} for participant {participant_id}: {e}")
+        continue
 
     # save the time series and task data
-    output_file_prefix = f"{subj_id}_{task}"
-    if not os.path.exists(f"{output_root}/{subj_id}/"):
-        os.makedirs(f"{output_root}/{subj_id}/")
-    np.save(f"{output_root}/{subj_id}/{output_file_prefix}_time-series.npy", time_series)
-    np.save(f"{output_root}/{subj_id}/{output_file_prefix}_task-data.npy", task_data)
+    output_file_prefix = f"{participant_id}_{task}"
+    if not os.path.exists(f"{output_root}/{participant_id}/"):
+        os.makedirs(f"{output_root}/{participant_id}/")
+    np.save(
+        f"{output_root}/{participant_id}/{output_file_prefix}_time-series.npy",
+        time_series,
+    )
+    np.save(
+        f"{output_root}/{participant_id}/{output_file_prefix}_task-data.npy", task_data
+    )
 
 print("****************** DONE ******************")
 ####################################################################################

From 616de068e6182857b3d2840c1a1772621baff625 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 20 Jun 2025 23:44:43 -0400
Subject: [PATCH 212/401] add GMM task presence

---
 pydfc/ml_utils.py   |  20 +++----
 pydfc/task_utils.py | 132 ++++++++++++++++++++++++++------------------
 2 files changed, 85 insertions(+), 67 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index cad9abf..39e16e7 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -41,7 +41,6 @@
     calc_rest_duration,
     calc_task_duration,
     calc_transition_freq,
-    extract_abs_task_presence,
     extract_task_presence,
 )
 
@@ -189,14 +188,15 @@ def extract_task_features(TASKS, RUNS, session, roi_root, dFC_root, no_hrf=False
                 Fs_task = task_data["Fs_task"]
                 TR_task = 1 / Fs_task
 
-                task_presence = extract_task_presence(
+                task_presence, indices = extract_task_presence(
                     event_labels=task_data["event_labels"],
                     TR_task=TR_task,
                     TR_mri=task_data["TR_mri"],
                     binary=True,
-                    binarizing_method="shift",
+                    binarizing_method="GMM",
                     no_hrf=no_hrf,
                 )
+                task_presence = task_presence[indices]
 
                 relative_task_on = calc_relative_task_on(task_presence)
                 # task duration
@@ -244,19 +244,13 @@ def dFC_feature_extraction_subj_lvl(
     dFC_vecs = dFC_mat2vec(dFC_mat)
 
     # event data
-    # task_presence = extract_task_presence(
-    #     event_labels=task_data["event_labels"],
-    #     TR_task=1 / task_data["Fs_task"],
-    #     TR_mri=task_data["TR_mri"],
-    #     TR_array=TR_array,
-    #     binary=True,
-    #     binarizing_method="shift",
-    # )
-    abs_task_presence, indices = extract_abs_task_presence(
+    task_presence, indices = extract_task_presence(
         event_labels=task_data["event_labels"],
         TR_task=1 / task_data["Fs_task"],
         TR_mri=task_data["TR_mri"],
         TR_array=TR_array,
+        binary=True,
+        binarizing_method="GMM",
     )
 
     # features = dFC_vecs
@@ -264,7 +258,7 @@ def dFC_feature_extraction_subj_lvl(
 
     # use absolute task presence
     features = dFC_vecs[indices, :]
-    target = abs_task_presence.ravel()
+    target = task_presence.ravel()[indices]
 
     assert (
         features.shape[0] == target.shape[0]
diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index bbdff4e..b6a6e32 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -12,6 +12,7 @@
 import numpy as np
 from nilearn import glm
 from scipy import signal
+from sklearn.mixture import GaussianMixture
 
 from .dfc_utils import TR_intersection, rank_norm, visualize_conn_mat
 
@@ -285,13 +286,52 @@ def shifted_binarizing(
     return threshold
 
 
+def GMM_binarizing(
+    event_labels_all_task_hrf,
+    threshold=0.1,
+    downsample=True,
+    TR_mri=None,
+    TR_task=None,
+    TR_array=None,
+):
+    event_labels_all_task_hrf = event_labels_all_task_hrf.copy()
+    event_labels_all_task_hrf_reshaped = event_labels_all_task_hrf.reshape(-1, 1)
+    # Fit GMM
+    gmm = GaussianMixture(n_components=2, n_init=5).fit(
+        event_labels_all_task_hrf_reshaped
+    )
+    # downsample to MRI TR
+    if downsample:
+        event_labels_all_task_hrf_reshaped = downsample_events_hrf(
+            event_labels_all_task_hrf_reshaped, TR_mri, TR_task
+        )
+    # some dFC measures (window-based) have a different TR than the task data
+    if TR_array is not None:
+        event_labels_all_task_hrf_reshaped = event_labels_all_task_hrf_reshaped[TR_array]
+    # now predict on vs. off for the downsampled time points
+    probs = gmm.predict_proba(event_labels_all_task_hrf_reshaped)
+    # Identify which component corresponds to "on" (higher mean)
+    # Each component has a mean, and in this case:
+    # The "off" state should have a lower mean (closer to baseline).
+    # The "on" state should have a higher mean (HRF-convolved signal is elevated during task).
+    means = gmm.means_.flatten()
+    on_component = np.argmax(means)
+    # Get probability of being in the "on" state
+    p_on = probs[:, on_component]
+    # Create a binarized signal with transition points discarded
+    indices = np.where((p_on <= threshold) | (p_on >= (1 - threshold)))[0]
+    task_presence = np.where(p_on >= (1 - threshold), 1, 0)
+
+    return task_presence, indices
+
+
 def extract_task_presence(
     event_labels,
     TR_task,
     TR_mri,
     TR_array=None,
     binary=True,
-    binarizing_method="median",
+    binarizing_method="GMM",
     no_hrf=False,
 ):
     """
@@ -303,7 +343,7 @@ def extract_task_presence(
     This function extracts the task presence from the event labels and returns it in the same time points as the dFC data
     It also downsamples the task presence to the time points of the dFC data
     if binary is True, the task presence is binarized using the mean of the task presence
-    binarizing_method: 'median' or 'mean' or 'shift'
+    binarizing_method: 'median' or 'mean' or 'shift' or 'GMM'
     if binarizing_method is 'shift', the task presence is binarized such that the ratio of 1 to 0 is equal to the task presence ratio
 
     if no_hrf is True, the task presence is not convolved with HRF
@@ -328,70 +368,54 @@ def extract_task_presence(
     if binary:
         if binarizing_method == "median":
             threshold = np.median(event_labels_all_task_hrf)
+            task_presence = np.where(event_labels_all_task_hrf > threshold, 1, 0)
+            task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
+            # some dFC measures (window-based) have a different TR than the task data
+            if TR_array is not None:
+                task_presence = task_presence[TR_array]
+            indices = np.arange(task_presence.shape[0])
         elif binarizing_method == "mean":
             threshold = np.mean(event_labels_all_task_hrf)
+            task_presence = np.where(event_labels_all_task_hrf > threshold, 1, 0)
+            task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
+            # some dFC measures (window-based) have a different TR than the task data
+            if TR_array is not None:
+                task_presence = task_presence[TR_array]
+            indices = np.arange(task_presence.shape[0])
         elif binarizing_method == "shift":
             task_presence_ratio = np.mean(event_labels_all_task)
             threshold = shifted_binarizing(
                 event_labels_all_task_hrf=event_labels_all_task_hrf,
                 task_presence_ratio=task_presence_ratio,
             )
+            task_presence = np.where(event_labels_all_task_hrf > threshold, 1, 0)
+            task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
+            # some dFC measures (window-based) have a different TR than the task data
+            if TR_array is not None:
+                task_presence = task_presence[TR_array]
+            indices = np.arange(task_presence.shape[0])
+        elif binarizing_method == "GMM":
+            task_presence, indices = GMM_binarizing(
+                event_labels_all_task_hrf,
+                threshold=0.1,
+                downsample=True,
+                TR_mri=TR_mri,
+                TR_task=TR_task,
+                TR_array=TR_array,
+            )
         else:
-            raise ValueError("binarizing_method should be 'median', 'mean' or 'shift'")
-        task_presence = np.where(event_labels_all_task_hrf > threshold, 1, 0)
+            raise ValueError(
+                "binarizing_method should be 'median', 'mean', 'shift', or 'GMM'"
+            )
     else:
         task_presence = event_labels_all_task_hrf
+        task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
+        # some dFC measures (window-based) have a different TR than the task data
+        if TR_array is not None:
+            task_presence = task_presence[TR_array]
+        indices = np.arange(task_presence.shape[0])
 
-    task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
-
-    # some dFC measures (window-based) have a different TR than the task data
-    if TR_array is not None:
-        task_presence = task_presence[TR_array]
-
-    return task_presence
-
-
-def extract_abs_task_presence(
-    event_labels,
-    TR_task,
-    TR_mri,
-    TR_array=None,
-):
-    """
-    event_labels: event labels including 0 and event ids at the time each event happens
-    TR_task: TR of task
-    TR_mri: TR of MRI
-    TR_array: the time points of the dFC data, optional
-
-    This function considers time points above task_presence_shift as task presence
-    and time points below task_presence_shift as rest and discards the ones in the
-    grey area between them. It also returns the indices of time points that are
-    kept.
-    """
-    task_presence_mean = extract_task_presence(
-        event_labels=event_labels,
-        TR_task=TR_task,
-        TR_mri=TR_mri,
-        TR_array=TR_array,
-        binary=True,
-        binarizing_method="mean",
-        no_hrf=False,
-    )
-    task_presence_shift = extract_task_presence(
-        event_labels=event_labels,
-        TR_task=TR_task,
-        TR_mri=TR_mri,
-        TR_array=TR_array,
-        binary=True,
-        binarizing_method="shift",
-        no_hrf=False,
-    )
-    indices = np.where((task_presence_mean == 0) | (task_presence_shift == 1))[0]
-
-    abs_task_presence = task_presence_shift.copy()
-    abs_task_presence = abs_task_presence[indices]
-
-    return abs_task_presence, indices
+    return task_presence, indices
 
 
 ################################# Task Features ####################################

From ea3adf7c3147350afaf7d19f260daea5f101ad7a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 20 Jun 2025 23:46:36 -0400
Subject: [PATCH 213/401] minor

---
 pydfc/task_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index b6a6e32..f514f05 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -397,7 +397,7 @@ def extract_task_presence(
         elif binarizing_method == "GMM":
             task_presence, indices = GMM_binarizing(
                 event_labels_all_task_hrf,
-                threshold=0.1,
+                threshold=0.01,
                 downsample=True,
                 TR_mri=TR_mri,
                 TR_task=TR_task,

From 1a1095c66dbf9a82dfac5b1141e94c4205b50e90 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 21 Jun 2025 00:04:39 -0400
Subject: [PATCH 214/401] fix bug

---
 pydfc/task_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index f514f05..176e2b5 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -244,8 +244,9 @@ def downsample_events_hrf(events_hrf, TR_mri, TR_task, method="uniform"):
     the shape of events_hrf is (num_time_task, num_event_types) or (num_time_task,)
     the shape of the downsampled events_hrf is (num_time_mri, num_event_types)
     """
+    flag = False
     if len(events_hrf.shape) == 1:
-        flag = 1
+        flag = True
         events_hrf = np.expand_dims(events_hrf, axis=1)
     events_hrf_ds = []
     for i in range(events_hrf.shape[1]):

From 3912ee2530be85ed458701c937077ba415de3157 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 21 Jun 2025 00:21:09 -0400
Subject: [PATCH 215/401] update task presence in report

---
 task_dFC/generate_report.py | 56 ++++++++++---------------------------
 1 file changed, 14 insertions(+), 42 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index f71bcc0..3bbc88c 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -407,19 +407,19 @@ def plot_task_presence(
     TR_mri = task_data["TR_mri"]
     Fs_mri = 1 / TR_mri
 
-    task_presence_non_binarized = task_utils.extract_task_presence(
+    task_presence_non_binarized, _ = task_utils.extract_task_presence(
         event_labels=task_data["event_labels"],
         TR_task=TR_task,
         TR_mri=task_data["TR_mri"],
         binary=False,
     )
 
-    task_presence = task_utils.extract_task_presence(
+    task_presence, indices = task_utils.extract_task_presence(
         event_labels=task_data["event_labels"],
         TR_task=TR_task,
         TR_mri=task_data["TR_mri"],
         binary=True,
-        binarizing_method="shift",
+        binarizing_method="GMM",
     )
 
     time = np.arange(0, task_presence.shape[0]) / Fs_mri
@@ -433,6 +433,17 @@ def plot_task_presence(
         time[start_TR:end_TR], task_presence_non_binarized[start_TR:end_TR], linewidth=4
     )
     plt.plot(time[start_TR:end_TR], task_presence[start_TR:end_TR], linewidth=4)
+    # Define local time and signal
+    time_new = time[start_TR:end_TR]
+    task_presence_new = task_presence_non_binarized[start_TR:end_TR]
+
+    # Find indices that are BOTH in the range and in indices
+    all_range = np.arange(start_TR, end_TR)
+    local_indices = np.where(np.isin(all_range, indices))[
+        0
+    ]  # relative to the start_TR:end_TR slice
+
+    plt.scatter(time_new[local_indices], task_presence_new[local_indices], color="orange")
 
     # put vertical lines at the start of each TR
     for TR in range(start_TR, end_TR):
@@ -466,45 +477,6 @@ def plot_task_presence(
     plt.close()
 
 
-def calculate_subj_lvl_task_presence_characteristics(
-    roi_root,
-    subj,
-    task,
-    run=None,
-    session=None,
-):
-    task_data = load_task_data(roi_root, subj, task, run, session)
-    Fs_task = task_data["Fs_task"]
-    TR_task = 1 / Fs_task
-
-    task_presence = task_utils.extract_task_presence(
-        event_labels=task_data["event_labels"],
-        TR_task=TR_task,
-        TR_mri=task_data["TR_mri"],
-        binary=True,
-        binarizing_method="shift",
-    )
-    relative_task_on = task_utils.calc_relative_task_on(task_presence)
-    # task duration
-    avg_task_duration, var_task_duration = task_utils.calc_task_duration(
-        task_presence, task_data["TR_mri"]
-    )
-    # rest duration
-    avg_rest_duration, var_rest_duration = task_utils.calc_rest_duration(
-        task_presence, task_data["TR_mri"]
-    )
-    # freq of transitions
-    num_of_transitions, relative_transition_freq = task_utils.calc_transition_freq(
-        task_presence
-    )
-
-    print(f"Relative task on: {relative_task_on}")
-    print(f"Average task duration: {avg_task_duration} seconds")
-    print(f"Average rest duration: {avg_rest_duration} seconds")
-    print(f"Number of transitions: {num_of_transitions}")
-    print(f"Relative transition frequency: {relative_transition_freq}")
-
-
 # def plot_FCS():
 #     visualize_FCS(
 #         measure,

From 59876d9ca7476aa5786190c8e1f31d11de12abd2 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 21 Jun 2025 00:26:08 -0400
Subject: [PATCH 216/401] minor

---
 pydfc/task_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 176e2b5..ea8589a 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -398,7 +398,7 @@ def extract_task_presence(
         elif binarizing_method == "GMM":
             task_presence, indices = GMM_binarizing(
                 event_labels_all_task_hrf,
-                threshold=0.01,
+                threshold=0.1,
                 downsample=True,
                 TR_mri=TR_mri,
                 TR_task=TR_task,

From 9e8cfe948a8880b03ac14293c71e8a61a1995459 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 21 Jun 2025 00:46:03 -0400
Subject: [PATCH 217/401] add fallback_threshold to GMM

---
 pydfc/task_utils.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index ea8589a..5342be2 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -289,7 +289,7 @@ def shifted_binarizing(
 
 def GMM_binarizing(
     event_labels_all_task_hrf,
-    threshold=0.1,
+    threshold=0.01,
     downsample=True,
     TR_mri=None,
     TR_task=None,
@@ -323,6 +323,26 @@ def GMM_binarizing(
     indices = np.where((p_on <= threshold) | (p_on >= (1 - threshold)))[0]
     task_presence = np.where(p_on >= (1 - threshold), 1, 0)
 
+    # check that both classes are non-empty
+    unique_labels = np.unique(task_presence[indices])
+    if len(unique_labels) < 2:
+        fallback_threshold = 0.10
+        indices = np.where(
+            (p_on <= fallback_threshold) | (p_on >= (1 - fallback_threshold))
+        )[0]
+        task_presence = np.where(p_on >= (1 - fallback_threshold), 1, 0)
+
+        # Re-check after fallback
+        unique_labels = np.unique(task_presence[indices])
+        if len(unique_labels) < 2:
+            warnings.warn(
+                f"Even with fallback threshold={fallback_threshold}, only one class present in confident samples."
+            )
+        else:
+            warnings.warn(
+                f"Only one class detected at threshold={threshold}, falling back to threshold={fallback_threshold}."
+            )
+
     return task_presence, indices
 
 
@@ -398,7 +418,7 @@ def extract_task_presence(
         elif binarizing_method == "GMM":
             task_presence, indices = GMM_binarizing(
                 event_labels_all_task_hrf,
-                threshold=0.1,
+                threshold=0.01,
                 downsample=True,
                 TR_mri=TR_mri,
                 TR_task=TR_task,

From 119ed98f6a40460885ce06a21ebbda3d66cb41d3 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 24 Jun 2025 14:29:48 -0400
Subject: [PATCH 218/401] add LE_transform_dFC

---
 pydfc/ml_utils.py | 47 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 41 insertions(+), 6 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 39e16e7..555abe4 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -10,7 +10,6 @@
 
 import numpy as np
 from scipy.spatial import procrustes
-from scipy.stats import zscore
 from sklearn.base import clone
 from sklearn.cluster import KMeans
 from sklearn.decomposition import PCA
@@ -697,6 +696,42 @@ def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"):
     return X_embed
 
 
+def LE_transform_dFC(X, n_components, n_neighbors, distance_metric="euclidean"):
+    """
+    Transform dFC features into a lower dimensional space using Laplacian Eigenmaps (LE).
+    This function takes care of the case where the dFC samples are not unique,
+    specifically for state-based dFC features.
+    """
+    unique_samples = np.unique(X, axis=0)
+    # if there are repeated samples, we need to apply LE on the unique samples
+    if unique_samples.shape[0] == X.shape[0]:
+        # if all samples are unique, we can apply LE directly on the data
+        X_embedded = LE_transform(
+            X=X,
+            n_components=n_components,
+            n_neighbors=n_neighbors,
+            distance_metric=distance_metric,
+        )
+    else:
+        n_neighbors_LE = int(3 / 5 * unique_samples.shape[0])
+        unique_samples_embedded = LE_transform(
+            X=unique_samples,
+            n_components=n_components,
+            n_neighbors=n_neighbors_LE,
+            distance_metric=distance_metric,
+        )
+
+        # for each entry in X, put the corresponding entry in unique_samples_embedded
+        # in the corresponding position in X_embedded
+        X_embedded = np.zeros((X.shape[0], n_components))
+        for i, sample in enumerate(unique_samples):
+            idx = np.where((X == sample).all(axis=1))[0]
+            if len(idx) > 0:
+                X_embedded[idx] = unique_samples_embedded[i]
+
+    return X_embedded
+
+
 def LE_embed_procustes(
     X_train,
     X_test,
@@ -720,7 +755,7 @@ def LE_embed_procustes(
             ), f"Indices of {subject} are not consecutive"
             X_subj = X_train[subj_label_train == subject, :]
             y_subj = y_train[subj_label_train == subject]
-            X_subj_embed = LE_transform(
+            X_subj_embed = LE_transform_dFC(
                 X=X_subj,
                 n_components=n_components,
                 n_neighbors=n_neighbors_LE,
@@ -768,7 +803,7 @@ def LE_embed_procustes(
                 np.diff(np.where(subj_label_test == subject)[0]) == 1
             ), f"Indices of {subject} are not consecutive"
             X_subj = X_test[subj_label_test == subject, :]
-            X_subj_embed = LE_transform(
+            X_subj_embed = LE_transform_dFC(
                 X=X_subj,
                 n_components=n_components,
                 n_neighbors=n_neighbors_LE,
@@ -797,7 +832,7 @@ def LE_embed_procustes(
                 np.diff(np.where(subj_label_train == subject)[0]) == 1
             ), f"Indices of {subject} are not consecutive"
             X_subj = X_train[subj_label_train == subject, :]
-            X_subj_embed = LE_transform(
+            X_subj_embed = LE_transform_dFC(
                 X=X_subj,
                 n_components=n_components,
                 n_neighbors=n_neighbors_LE,
@@ -824,7 +859,7 @@ def LE_embed_procustes(
         X_test_embed = None
         for subject in test_subjects:
             X_subj = X_test[subj_label_test == subject, :]
-            X_subj_embed = LE_transform(
+            X_subj_embed = LE_transform_dFC(
                 X=X_subj,
                 n_components=n_components,
                 n_neighbors=n_neighbors_LE,
@@ -916,7 +951,7 @@ def embed_dFC_features(
                 X_concat = np.concatenate((X_train, X_test), axis=0)
             else:
                 X_concat = X_train
-            X_concat_embed = LE_transform(
+            X_concat_embed = LE_transform_dFC(
                 X=X_concat,
                 n_components=n_components,
                 n_neighbors=n_neighbors_LE,

From f72c8cef83bd0e1ee34378af78430563734bc80b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 25 Jun 2025 20:25:40 -0400
Subject: [PATCH 219/401] minor fix

---
 pydfc/ml_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 555abe4..8802cf2 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -723,7 +723,7 @@ def LE_transform_dFC(X, n_components, n_neighbors, distance_metric="euclidean"):
 
         # for each entry in X, put the corresponding entry in unique_samples_embedded
         # in the corresponding position in X_embedded
-        X_embedded = np.zeros((X.shape[0], n_components))
+        X_embedded = np.zeros((X.shape[0], unique_samples_embedded.shape[1]))
         for i, sample in enumerate(unique_samples):
             idx = np.where((X == sample).all(axis=1))[0]
             if len(idx) > 0:

From 0f02d716018aee9754819de630d6445e16f53d31 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 26 Jun 2025 12:16:30 -0400
Subject: [PATCH 220/401] switch to "concat+embed" for state-based

---
 pydfc/ml_utils.py           | 29 ++++++++++++++++++++---------
 task_dFC/generate_report.py |  4 ++--
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 8802cf2..a2c8518 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -704,15 +704,7 @@ def LE_transform_dFC(X, n_components, n_neighbors, distance_metric="euclidean"):
     """
     unique_samples = np.unique(X, axis=0)
     # if there are repeated samples, we need to apply LE on the unique samples
-    if unique_samples.shape[0] == X.shape[0]:
-        # if all samples are unique, we can apply LE directly on the data
-        X_embedded = LE_transform(
-            X=X,
-            n_components=n_components,
-            n_neighbors=n_neighbors,
-            distance_metric=distance_metric,
-        )
-    else:
+    if unique_samples.shape[0] < X.shape[0] // 2:
         n_neighbors_LE = int(3 / 5 * unique_samples.shape[0])
         unique_samples_embedded = LE_transform(
             X=unique_samples,
@@ -728,6 +720,14 @@ def LE_transform_dFC(X, n_components, n_neighbors, distance_metric="euclidean"):
             idx = np.where((X == sample).all(axis=1))[0]
             if len(idx) > 0:
                 X_embedded[idx] = unique_samples_embedded[i]
+    else:
+        # if all samples are unique, we can apply LE directly on the data
+        X_embedded = LE_transform(
+            X=X,
+            n_components=n_components,
+            n_neighbors=n_neighbors,
+            distance_metric=distance_metric,
+        )
 
     return X_embedded
 
@@ -900,6 +900,7 @@ def embed_dFC_features(
     All the subjects are transformed into the space of the subject with the highest silhouette score.
 
     LE_embedding_method: "concat+embed" or "embed+procrustes"
+    if the dFC features are not unique (state-based), "embed+procrustes" will not work. So this function will switch to "concat+embed" method.
     """
     # make a copy of the data
     X_train = X_train.copy()
@@ -919,6 +920,13 @@ def embed_dFC_features(
         else:
             X_test_embed = None
     elif embedding == "LE":
+        # if the dFC features are not unique (state-based), set the LE_embedding_method to "concat+embed"
+        if np.unique(X_train, axis=0).shape[0] < X_train.shape[0] // 2:
+            if LE_embedding_method == "embed+procrustes":
+                warnings.warn(
+                    "The dFC features are not unique (state-based). Switching to 'concat+embed' method."
+                )
+                LE_embedding_method = "concat+embed"
         # if n_components is not specified, find the intrinsic dimension of the data using training set and based on the silhouette score
         if n_components == "auto":
             n_components = find_intrinsic_dim(
@@ -947,6 +955,9 @@ def embed_dFC_features(
             )
         elif LE_embedding_method == "concat+embed":
             # since SpectralEmbedding does not have transform method, we need to fit the LE on the whole data
+            # but note that this method is used mostly for state-based dFC features, and in this case the
+            # samples are the same across subjects, so we can concatenate the training and test sets
+            # and then apply LE on the concatenated data
             if X_test is not None:
                 X_concat = np.concatenate((X_train, X_test), axis=0)
             else:
diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 3bbc88c..27df6cb 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -618,7 +618,7 @@ def plot_ML_results(
         "balanced accuracy",
         "precision",
         "recall",
-        "f1",
+        # "f1",
         # "tp",
         # "tn",
         # "fp",
@@ -1103,7 +1103,7 @@ def create_html_report_group_results(
         "balanced accuracy",
         "precision",
         "recall",
-        "f1",
+        # "f1",
         # "tp",
         # "tn",
         # "fp",

From 29a9ece828c9725500bd3043ea48f0233e3132f0 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 26 Jun 2025 12:54:19 -0400
Subject: [PATCH 221/401] minor

---
 task_dFC/generate_report.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 27df6cb..bf06057 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -442,6 +442,9 @@ def plot_task_presence(
     local_indices = np.where(np.isin(all_range, indices))[
         0
     ]  # relative to the start_TR:end_TR slice
+    print(f"Local indices: {local_indices}")
+    print(f"Indices: {indices}")
+    print(f"all_range: {all_range}")
 
     plt.scatter(time_new[local_indices], task_presence_new[local_indices], color="orange")
 

From 9b5f90126c1faa810af56623e21887fd66c57be4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 26 Jun 2025 13:46:23 -0400
Subject: [PATCH 222/401] minor

---
 task_dFC/generate_report.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index bf06057..783b92e 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -446,7 +446,7 @@ def plot_task_presence(
     print(f"Indices: {indices}")
     print(f"all_range: {all_range}")
 
-    plt.scatter(time_new[local_indices], task_presence_new[local_indices], color="orange")
+    plt.scatter(time_new[local_indices], task_presence_new[local_indices], color="brown")
 
     # put vertical lines at the start of each TR
     for TR in range(start_TR, end_TR):

From 3826f1f693e898897582e2368bde064bb1fccab2 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 26 Jun 2025 23:11:35 -0400
Subject: [PATCH 223/401] show kept TRs for report

---
 task_dFC/generate_report.py | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/task_dFC/generate_report.py b/task_dFC/generate_report.py
index 783b92e..3d33afc 100644
--- a/task_dFC/generate_report.py
+++ b/task_dFC/generate_report.py
@@ -433,24 +433,13 @@ def plot_task_presence(
         time[start_TR:end_TR], task_presence_non_binarized[start_TR:end_TR], linewidth=4
     )
     plt.plot(time[start_TR:end_TR], task_presence[start_TR:end_TR], linewidth=4)
-    # Define local time and signal
-    time_new = time[start_TR:end_TR]
-    task_presence_new = task_presence_non_binarized[start_TR:end_TR]
-
-    # Find indices that are BOTH in the range and in indices
-    all_range = np.arange(start_TR, end_TR)
-    local_indices = np.where(np.isin(all_range, indices))[
-        0
-    ]  # relative to the start_TR:end_TR slice
-    print(f"Local indices: {local_indices}")
-    print(f"Indices: {indices}")
-    print(f"all_range: {all_range}")
-
-    plt.scatter(time_new[local_indices], task_presence_new[local_indices], color="brown")
 
     # put vertical lines at the start of each TR
     for TR in range(start_TR, end_TR):
-        plt.axvline(x=TR * TR_mri, color="r", linestyle="--")
+        if TR in indices:
+            plt.axvline(x=TR * TR_mri, color="g", linestyle="--")
+        else:
+            plt.axvline(x=TR * TR_mri, color="r", linestyle="--")
     # show TR labels on the red lines with a small font and at the top
     for TR in range(start_TR, end_TR):
         plt.text(TR * TR_mri, 1.2, f"TR {TR}", fontsize=8, color="black", ha="center")

From 4017283c041e0ba53dda159cf45a2e1cca1f70a3 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 30 Jun 2025 22:10:26 -0400
Subject: [PATCH 224/401] alter_hparams in FCS estimate

---
 task_dFC/FCS_estimate.py | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index bf04818..f254134 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -65,10 +65,22 @@ def run_FCS_estimate(
     )
 
     # in this script we process only one measure
-    assert len(MEASURES_lst) == 1, "Only one measure should be processed in this script"
-    # and we assume alter_hparams is empty
-    # if not, we need to change the naming of the output files
-    assert len(alter_hparams) == 0, "alter_hparams is assumed to be empty in this script"
+    # if alter_hparams is not empty, we need to change the naming of the output files
+    # to differentiate between the measures
+    if len(MEASURES_lst) == 1:
+        only_one_measure = True
+    else:
+        only_one_measure = False
+
+    if not only_one_measure:
+        # we assume only one hyperparameter is altered
+        # alter_hparams is a dictionary with one key
+        # otherwise, the naming of the output files has to be changed
+        assert len(alter_hparams) == 1, (
+            "alter_hparams should have only one key, "
+            "but got more than one. This script is designed to process only one hyperparameter."
+        )
+        hyper_param_name = [key for key in alter_hparams.keys()][0]
 
     tic = time.time()
     print("Measurement Started ...")
@@ -83,9 +95,10 @@ def run_FCS_estimate(
         backend=params_multi_analysis["backend"],
     )
 
-    assert (
-        len(MEASURES_fit_lst) == 1
-    ), "Only one measure should be processed in this script"
+    if only_one_measure:
+        assert (
+            len(MEASURES_fit_lst) == 1
+        ), "Only one measure should be processed, but got more than one."
 
     # Save the fitted measures
     for measure in MEASURES_fit_lst:
@@ -94,7 +107,10 @@ def run_FCS_estimate(
                 os.makedirs(f"{output_dir}")
         except OSError as err:
             print(err)
-        measure_name = MEASURES_name_lst[0]
+        if only_one_measure:
+            measure_name = measure.measure_name
+        else:
+            measure_name = f"{measure.measure_name}-{hyper_param_name}-{measure.params[hyper_param_name]}"
         np.save(f"{output_dir}/MEASURE_{file_suffix}_{measure_name}.npy", measure)
 
     print(f"Measurement required {time.time() - tic:0.3f} seconds.")

From 8a2298098030d6f5c04b1ee6b62f20c5cc35ebb2 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 30 Jun 2025 22:44:53 -0400
Subject: [PATCH 225/401] add W to SW name in ML

---
 pydfc/ml_utils.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index a2c8518..1f58c44 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -341,7 +341,13 @@ def dFC_feature_extraction(
             y_train = np.concatenate((y_train, y_subj), axis=0)
 
         if dFC_measure_name is None:
-            dFC_measure_name = dFC.measure.measure_name
+            if dFC.measure.measure_name == "SlidingWindow":
+                # for SlidingWindow, we also put the W parameter in the measure name
+                dFC_measure_name = (
+                    f"{dFC.measure.measure_name}-W{dFC.measure.params['W']}"
+                )
+            else:
+                dFC_measure_name = dFC.measure.measure_name
         else:
             assert (
                 dFC_measure_name == dFC.measure.measure_name
@@ -379,7 +385,13 @@ def dFC_feature_extraction(
             y_test = np.concatenate((y_test, y_subj), axis=0)
 
         if dFC_measure_name is None:
-            dFC_measure_name = dFC.measure.measure_name
+            if dFC.measure.measure_name == "SlidingWindow":
+                # for SlidingWindow, we also put the W parameter in the measure name
+                dFC_measure_name = (
+                    f"{dFC.measure.measure_name}-W{dFC.measure.params['W']}"
+                )
+            else:
+                dFC_measure_name = dFC.measure.measure_name
         else:
             assert (
                 dFC_measure_name == dFC.measure.measure_name

From 7e6edb709626f09e7b1fc9bf3520321ddd3588ad Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 1 Jul 2025 14:52:04 -0400
Subject: [PATCH 226/401] minor bug

---
 pydfc/ml_utils.py | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 1f58c44..5c78288 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -340,17 +340,18 @@ def dFC_feature_extraction(
             X_train = np.concatenate((X_train, X_subj), axis=0)
             y_train = np.concatenate((y_train, y_subj), axis=0)
 
+        if dFC.measure.measure_name == "SlidingWindow":
+            # for SlidingWindow, we also put the W parameter in the measure name
+            dFC_measure_name_new = (
+                f"{dFC.measure.measure_name}-W{dFC.measure.params['W']}"
+            )
+        else:
+            dFC_measure_name_new = dFC.measure.measure_name
         if dFC_measure_name is None:
-            if dFC.measure.measure_name == "SlidingWindow":
-                # for SlidingWindow, we also put the W parameter in the measure name
-                dFC_measure_name = (
-                    f"{dFC.measure.measure_name}-W{dFC.measure.params['W']}"
-                )
-            else:
-                dFC_measure_name = dFC.measure.measure_name
+            dFC_measure_name = dFC_measure_name_new
         else:
             assert (
-                dFC_measure_name == dFC.measure.measure_name
+                dFC_measure_name == dFC_measure_name_new
             ), "dFC measure is not consistent."
 
     X_test = None
@@ -384,17 +385,18 @@ def dFC_feature_extraction(
             X_test = np.concatenate((X_test, X_subj), axis=0)
             y_test = np.concatenate((y_test, y_subj), axis=0)
 
+        if dFC.measure.measure_name == "SlidingWindow":
+            # for SlidingWindow, we also put the W parameter in the measure name
+            dFC_measure_name_new = (
+                f"{dFC.measure.measure_name}-W{dFC.measure.params['W']}"
+            )
+        else:
+            dFC_measure_name_new = dFC.measure.measure_name
         if dFC_measure_name is None:
-            if dFC.measure.measure_name == "SlidingWindow":
-                # for SlidingWindow, we also put the W parameter in the measure name
-                dFC_measure_name = (
-                    f"{dFC.measure.measure_name}-W{dFC.measure.params['W']}"
-                )
-            else:
-                dFC_measure_name = dFC.measure.measure_name
+            dFC_measure_name = dFC_measure_name_new
         else:
             assert (
-                dFC_measure_name == dFC.measure.measure_name
+                dFC_measure_name == dFC_measure_name_new
             ), "dFC measure is not consistent."
 
     # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

From 7732a03a30f2bb16365dac9f1bb837b3585a3506 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 7 Jul 2025 14:33:09 -0400
Subject: [PATCH 227/401] use FCS_proba as features for SB methods

---
 pydfc/ml_utils.py | 105 ++++++++++++++++++++++++++++++----------------
 1 file changed, 68 insertions(+), 37 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 5c78288..11a8e31 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -236,11 +236,25 @@ def dFC_feature_extraction_subj_lvl(
     dynamic_pred: "no", "past", "past_and_future"
     """
     # dFC features
-    dFC_mat = dFC.get_dFC_mat()
-    TR_array = dFC.TR_array
-    if normalize_dFC:
-        dFC_mat = rank_norm(dFC_mat)
-    dFC_vecs = dFC_mat2vec(dFC_mat)
+    # for state-based dFC, we use the FCS_proba as features
+    # for state-free dFC, we use the dFC matrix as features
+    if dFC.measure.is_state_based:
+        # state-based dFC
+        dFC_vecs = dFC.FCS_proba  # shape: (n_time, n_states)
+        TR_array = dFC.TR_array
+
+        assert dFC_vecs.shape[0] == len(
+            TR_array
+        ), "dFC_vecs and TR_array have different number of samples."
+        assert (
+            dFC_vecs.shape[1] == dFC.measure.params["n_states"]
+        ), "dFC_vecs and n_states are not consistent."
+    else:
+        dFC_mat = dFC.get_dFC_mat()
+        TR_array = dFC.TR_array
+        if normalize_dFC:
+            dFC_mat = rank_norm(dFC_mat)
+        dFC_vecs = dFC_mat2vec(dFC_mat)
 
     # event data
     task_presence, indices = extract_task_presence(
@@ -1452,23 +1466,32 @@ def task_presence_classification(
     }
     for embedding in ["PCA", "LE"]:
         # embed dFC features
-        try:
-            X_train_embedded, X_test_embedded = embed_dFC_features(
-                train_subjects=train_subjects,
-                test_subjects=test_subjects,
-                X_train=X_train,
-                X_test=X_test,
-                y_train=y_train,
-                y_test=y_test,
-                subj_label_train=subj_label_train,
-                subj_label_test=subj_label_test,
-                embedding=embedding,
-                n_components="auto",
-                n_neighbors_LE=125,
-                LE_embedding_method="embed+procrustes",
+        # if the number of features is smaller than 25, we assume that dimensionality reduction is not needed
+        # specially for state-based dFC features, the number of features is equal to the number of states
+        if X_train.shape[1] < 25:
+            X_train_embedded = X_train
+            X_test_embedded = X_test
+            print(
+                f"Number of features is {X_train.shape[1]}. No dimensionality reduction is applied."
             )
-        except:
-            continue
+        else:
+            try:
+                X_train_embedded, X_test_embedded = embed_dFC_features(
+                    train_subjects=train_subjects,
+                    test_subjects=test_subjects,
+                    X_train=X_train,
+                    X_test=X_test,
+                    y_train=y_train,
+                    y_test=y_test,
+                    subj_label_train=subj_label_train,
+                    subj_label_test=subj_label_test,
+                    embedding=embedding,
+                    n_components="auto",
+                    n_neighbors_LE=125,
+                    LE_embedding_method="embed+procrustes",
+                )
+            except:
+                continue
 
         # Silhouette score
         SI = {
@@ -1674,23 +1697,31 @@ def task_presence_clustering(
     }
     for embedding in ["PCA", "LE"]:
         # embed dFC features
-        try:
-            X_embedded, _ = embed_dFC_features(
-                train_subjects=SUBJECTS,
-                test_subjects=[],
-                X_train=X,
-                X_test=None,
-                y_train=y,
-                y_test=None,
-                subj_label_train=subj_label,
-                subj_label_test=None,
-                embedding=embedding,
-                n_components="auto",
-                n_neighbors_LE=125,
-                LE_embedding_method="embed+procrustes",
+        # if the number of features is smaller than 25, we assume that dimensionality reduction is not needed
+        # specially for state-based dFC features, the number of features is equal to the number of states
+        if X.shape[1] < 25:
+            X_embedded = X
+            print(
+                f"Number of features is {X.shape[1]}. No dimensionality reduction is applied."
             )
-        except:
-            continue
+        else:
+            try:
+                X_embedded, _ = embed_dFC_features(
+                    train_subjects=SUBJECTS,
+                    test_subjects=[],
+                    X_train=X,
+                    X_test=None,
+                    y_train=y,
+                    y_test=None,
+                    subj_label_train=subj_label,
+                    subj_label_test=None,
+                    embedding=embedding,
+                    n_components="auto",
+                    n_neighbors_LE=125,
+                    LE_embedding_method="embed+procrustes",
+                )
+            except:
+                continue
 
         # clustering
         # apply kmeans clustering to dFC features

From 623ebc384a20d347e3a127a78b5b4822497ea67f Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 7 Jul 2025 18:43:13 -0400
Subject: [PATCH 228/401] parallel run_classification

---
 task_dFC/ML.py | 100 +++++++++++++++++++++++++++++++------------------
 1 file changed, 63 insertions(+), 37 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 54f57f5..e8c7bdb 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -4,6 +4,7 @@
 import traceback
 
 import numpy as np
+from joblib import Parallel, delayed
 
 from pydfc.ml_utils import (
     cluster_for_visual,
@@ -12,6 +13,10 @@
     task_presence_clustering,
 )
 
+os.environ["MKL_NUM_THREADS"] = "1"
+os.environ["NUMEXPR_NUM_THREADS"] = "1"
+os.environ["OMP_NUM_THREADS"] = "1"
+
 #######################################################################################
 
 
@@ -63,6 +68,27 @@ def run_task_features_extraction(
             print(err)
 
 
+def classify_single_run(
+    task, run, session, dFC_id, roi_root, dFC_root, dynamic_pred, normalize_dFC
+):
+    try:
+        ML_scores_new = task_presence_classification(
+            task=task,
+            dFC_id=dFC_id,
+            roi_root=roi_root,
+            dFC_root=dFC_root,
+            run=run,
+            session=session,
+            dynamic_pred=dynamic_pred,
+            normalize_dFC=normalize_dFC,
+        )
+        return task, run, ML_scores_new
+    except Exception as e:
+        print(f"Error in task presence classification for {session} {task} {run}: {e}")
+        traceback.print_exc()
+        return task, run, None
+
+
 def run_classification(
     dFC_id,
     TASKS,
@@ -73,54 +99,54 @@ def run_classification(
     output_root,
     dynamic_pred="no",
     normalize_dFC=True,
+    n_jobs=-1,  # Number of parallel jobs; -1 = all available cores
 ):
     for session in SESSIONS:
-        if not session is None:
+        if session is not None:
             print(f"=================== {session} ===================")
 
         ML_scores = {
             "group_lvl": {},
             "subj_lvl": {},
         }
-        for task_id, task in enumerate(TASKS):
-            for run in RUNS[task]:
-                try:
-                    ML_scores_new = task_presence_classification(
-                        task=task,
-                        dFC_id=dFC_id,
-                        roi_root=roi_root,
-                        dFC_root=dFC_root,
-                        run=run,
-                        session=session,
-                        dynamic_pred=dynamic_pred,
-                        normalize_dFC=normalize_dFC,
-                    )
-                    # group level scores
-                    for key in ML_scores_new["group_lvl"]:
-                        if key not in ML_scores["group_lvl"]:
-                            ML_scores["group_lvl"][key] = list()
-                        ML_scores["group_lvl"][key].extend(
-                            ML_scores_new["group_lvl"][key]
-                        )
-                    # subject level scores
-                    for key in ML_scores_new["subj_lvl"]:
-                        if key not in ML_scores["subj_lvl"]:
-                            ML_scores["subj_lvl"][key] = list()
-                        ML_scores["subj_lvl"][key].extend(ML_scores_new["subj_lvl"][key])
 
-                except Exception as e:
-                    print(
-                        f"Error in task presence classification for {session} {task} {run}: {e}"
-                    )
-                    traceback.print_exc()
+        # Parallel execution
+        results = Parallel(n_jobs=n_jobs, verbose=0, backend="loky")(
+            delayed(classify_single_run)(
+                task,
+                run,
+                session,
+                dFC_id,
+                roi_root,
+                dFC_root,
+                dynamic_pred,
+                normalize_dFC,
+            )
+            for task in TASKS
+            for run in RUNS[task]
+        )
 
-        if session is None:
-            folder = f"{output_root}/classification"
-        else:
-            folder = f"{output_root}/classification/{session}"
+        # Aggregate results
+        for task, run, result in results:
+            if result is None:
+                continue
+            for key in result["group_lvl"]:
+                if key not in ML_scores["group_lvl"]:
+                    ML_scores["group_lvl"][key] = []
+                ML_scores["group_lvl"][key].extend(result["group_lvl"][key])
+            for key in result["subj_lvl"]:
+                if key not in ML_scores["subj_lvl"]:
+                    ML_scores["subj_lvl"][key] = []
+                ML_scores["subj_lvl"][key].extend(result["subj_lvl"][key])
+
+        # Save output
+        folder = (
+            f"{output_root}/classification"
+            if session is None
+            else f"{output_root}/classification/{session}"
+        )
         try:
-            if not os.path.exists(folder):
-                os.makedirs(folder)
+            os.makedirs(folder, exist_ok=True)
         except OSError as err:
             print(err)
 

From 65bab3083b196b7df53735ccb08f3342230b28d3 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 7 Jul 2025 23:24:14 -0400
Subject: [PATCH 229/401] minor

---
 task_dFC/ML.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index e8c7bdb..ea857f2 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -378,6 +378,7 @@ def run_clustering_for_visual(
             output_root=ML_root,
             dynamic_pred="no",
             normalize_dFC=True,
+            n_jobs=6,
         )
     except Exception as e:
         print(f"Error in classification for dFC ID {dFC_id}: {e}")

From 0f70a0da74a0e74a04707db50387821beba12803 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 9 Jul 2025 10:37:59 -0400
Subject: [PATCH 230/401] add FCS_proba_for_SB

---
 pydfc/ml_utils.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 11a8e31..a4f156f 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -229,16 +229,20 @@ def dFC_feature_extraction_subj_lvl(
     task_data,
     dynamic_pred="no",
     normalize_dFC=True,
+    FCS_proba_for_SB=True,
 ):
     """
     Extract features and target for task presence classification
     for a single subject.
     dynamic_pred: "no", "past", "past_and_future"
+
+    FCS_proba_for_SB: if True, use FCS_proba as features for state-based dFC.
+    If False, use dFC_vecs (dFC matrix as features).
     """
     # dFC features
     # for state-based dFC, we use the FCS_proba as features
     # for state-free dFC, we use the dFC matrix as features
-    if dFC.measure.is_state_based:
+    if dFC.measure.is_state_based and FCS_proba_for_SB:
         # state-based dFC
         dFC_vecs = dFC.FCS_proba  # shape: (n_time, n_states)
         TR_array = dFC.TR_array
@@ -315,11 +319,15 @@ def dFC_feature_extraction(
     session=None,
     dynamic_pred="no",
     normalize_dFC=True,
+    FCS_proba_for_SB=True,
 ):
     """
     Extract features and target for task presence classification
     for all subjects.
     if run is specified, dFC results for that run will be used.
+
+    if FCS_proba_for_SB is True, use FCS_proba as features for state-based dFC.
+    If False, use dFC_vecs (dFC matrix as features).
     """
     dFC_measure_name = None
     X_train = None
@@ -344,6 +352,7 @@ def dFC_feature_extraction(
             task_data=task_data,
             dynamic_pred=dynamic_pred,
             normalize_dFC=normalize_dFC,
+            FCS_proba_for_SB=FCS_proba_for_SB,
         )
 
         subj_label_train.extend([subj for i in range(X_subj.shape[0])])
@@ -389,6 +398,7 @@ def dFC_feature_extraction(
             task_data=task_data,
             dynamic_pred=dynamic_pred,
             normalize_dFC=normalize_dFC,
+            FCS_proba_for_SB=FCS_proba_for_SB,
         )
 
         subj_label_test.extend([subj for i in range(X_subj.shape[0])])
@@ -1442,6 +1452,7 @@ def task_presence_classification(
             session=session,
             dynamic_pred=dynamic_pred,
             normalize_dFC=normalize_dFC,
+            FCS_proba_for_SB=True,  # for state-based dFC features, we use FCS_proba
         )
     )
 
@@ -1683,6 +1694,7 @@ def task_presence_clustering(
         session=session,
         dynamic_pred="no",
         normalize_dFC=normalize_dFC,
+        FCS_proba_for_SB=True,  # for state-based dFC features, we use FCS_proba
     )
 
     clustering_RESULTS = {"PCA": {}, "LE": {}}
@@ -1838,6 +1850,7 @@ def cluster_for_visual(
         session=session,
         dynamic_pred="no",
         normalize_dFC=normalize_dFC,
+        FCS_proba_for_SB=False,
     )
 
     # clustering

From fbbda114e641a045cc00e210bf384bf0e83e151d Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 10 Jul 2025 20:23:10 -0400
Subject: [PATCH 231/401] minor fix in SI_ID

---
 pydfc/ml_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index a4f156f..de28e34 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -631,6 +631,9 @@ def SI_ID(X, y, search_range=range(2, 50, 5), n_neighbors_LE=125):
 
     SI_score = {}
     for n_components in search_range:
+        if n_components > X.shape[1]:
+            # if the number of components is larger than the number of features, break
+            break
         try:
             X_train_embed, _ = embed_dFC_features(
                 train_subjects=["subj"],

From 336486d95b3378730ff9f90ce64a0ea5bd72c55e Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 10 Jul 2025 20:27:24 -0400
Subject: [PATCH 232/401] minor

---
 pydfc/ml_utils.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index de28e34..69cd695 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -363,13 +363,7 @@ def dFC_feature_extraction(
             X_train = np.concatenate((X_train, X_subj), axis=0)
             y_train = np.concatenate((y_train, y_subj), axis=0)
 
-        if dFC.measure.measure_name == "SlidingWindow":
-            # for SlidingWindow, we also put the W parameter in the measure name
-            dFC_measure_name_new = (
-                f"{dFC.measure.measure_name}-W{dFC.measure.params['W']}"
-            )
-        else:
-            dFC_measure_name_new = dFC.measure.measure_name
+        dFC_measure_name_new = dFC.measure.measure_name
         if dFC_measure_name is None:
             dFC_measure_name = dFC_measure_name_new
         else:
@@ -409,13 +403,7 @@ def dFC_feature_extraction(
             X_test = np.concatenate((X_test, X_subj), axis=0)
             y_test = np.concatenate((y_test, y_subj), axis=0)
 
-        if dFC.measure.measure_name == "SlidingWindow":
-            # for SlidingWindow, we also put the W parameter in the measure name
-            dFC_measure_name_new = (
-                f"{dFC.measure.measure_name}-W{dFC.measure.params['W']}"
-            )
-        else:
-            dFC_measure_name_new = dFC.measure.measure_name
+        dFC_measure_name_new = dFC.measure.measure_name
         if dFC_measure_name is None:
             dFC_measure_name = dFC_measure_name_new
         else:

From 763ef9289ed4964f3c71f9c14c41e8fa432c72d9 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 10 Jul 2025 22:03:44 -0400
Subject: [PATCH 233/401] minor

---
 pydfc/ml_utils.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 69cd695..f9e5a0d 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1496,9 +1496,15 @@ def task_presence_classification(
                 continue
 
         # Silhouette score
+        # SI does not need to be separated for train and test sets
+        # we will use the same SI for both train and test sets
+        # using all samples from train and test sets
+        X_combined = np.concatenate((X_train_embedded, X_test_embedded), axis=0)
+        y_combined = np.concatenate((y_train, y_test), axis=0)
+
         SI = {
-            "train": silhouette_score(X_train_embedded, y_train),
-            "test": silhouette_score(X_test_embedded, y_test),
+            "train": silhouette_score(X_combined, y_combined),
+            "test": silhouette_score(X_combined, y_combined),
         }
 
         # task presence classification

From 45b848b06111d9b1d3773ce89218d7b47f7ebae4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 10 Jul 2025 22:15:37 -0400
Subject: [PATCH 234/401] minor

---
 pydfc/ml_utils.py | 41 ++++++++++++++++-------------------------
 1 file changed, 16 insertions(+), 25 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index f9e5a0d..2ef6c34 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1468,32 +1468,23 @@ def task_presence_classification(
     }
     for embedding in ["PCA", "LE"]:
         # embed dFC features
-        # if the number of features is smaller than 25, we assume that dimensionality reduction is not needed
-        # specially for state-based dFC features, the number of features is equal to the number of states
-        if X_train.shape[1] < 25:
-            X_train_embedded = X_train
-            X_test_embedded = X_test
-            print(
-                f"Number of features is {X_train.shape[1]}. No dimensionality reduction is applied."
+        try:
+            X_train_embedded, X_test_embedded = embed_dFC_features(
+                train_subjects=train_subjects,
+                test_subjects=test_subjects,
+                X_train=X_train,
+                X_test=X_test,
+                y_train=y_train,
+                y_test=y_test,
+                subj_label_train=subj_label_train,
+                subj_label_test=subj_label_test,
+                embedding=embedding,
+                n_components="auto",
+                n_neighbors_LE=125,
+                LE_embedding_method="embed+procrustes",
             )
-        else:
-            try:
-                X_train_embedded, X_test_embedded = embed_dFC_features(
-                    train_subjects=train_subjects,
-                    test_subjects=test_subjects,
-                    X_train=X_train,
-                    X_test=X_test,
-                    y_train=y_train,
-                    y_test=y_test,
-                    subj_label_train=subj_label_train,
-                    subj_label_test=subj_label_test,
-                    embedding=embedding,
-                    n_components="auto",
-                    n_neighbors_LE=125,
-                    LE_embedding_method="embed+procrustes",
-                )
-            except:
-                continue
+        except:
+            continue
 
         # Silhouette score
         # SI does not need to be separated for train and test sets

From 0c014454890aa3ee38f67c4af013b1cad4beacc4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 10 Jul 2025 23:04:02 -0400
Subject: [PATCH 235/401] randomize struct conn and increase noise in simul

---
 pydfc/simul_utils.py                          | 13 +++++++
 .../tasks_info_ds003465.json                  |  8 ++--
 .../tasks_info_pulseTrain.json                | 38 +++----------------
 3 files changed, 22 insertions(+), 37 deletions(-)

diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py
index 0a00a48..79f00f7 100644
--- a/pydfc/simul_utils.py
+++ b/pydfc/simul_utils.py
@@ -122,6 +122,19 @@ def simulate_task_BOLD(
     ################################# Initialize Simulation ####################################
     conn = connectivity.Connectivity.from_file()
     conn.speed = np.array([conn_speed_rand])
+    conn.configure()
+    # randomize the structural connectivity
+    # Additive Gaussian noise (e.g. 10% of weight magnitude)
+    noise_level = 0.1  # 10%
+    conn.weights += np.random.normal(
+        loc=0,
+        scale=noise_level * np.std(conn.weights[conn.weights > 0]),
+        size=conn.weights.shape,
+    )
+    # Remove negative weights if any
+    conn.weights = np.clip(conn.weights, 0, None)
+    # reconfigure the connectivity
+    conn.configure()
 
     # configure stimulus spatial pattern
     if num_stimulated_regions == 5:
diff --git a/simul_dFC/run_scripts_slurm/tasks_info_ds003465.json b/simul_dFC/run_scripts_slurm/tasks_info_ds003465.json
index 009e173..01d3a0e 100644
--- a/simul_dFC/run_scripts_slurm/tasks_info_ds003465.json
+++ b/simul_dFC/run_scripts_slurm/tasks_info_ds003465.json
@@ -5,7 +5,7 @@
         "TAVG_period": 1.0,
         "num_stimulated_regions": 5,
         "global_conn_coupling_coef": 0.0126,
-        "D": 0.01,
+        "D": 0.1,
         "conn_speed": 1.0,
         "dt": 0.5
     },
@@ -15,7 +15,7 @@
         "TAVG_period": 1.0,
         "num_stimulated_regions": 5,
         "global_conn_coupling_coef": 0.0126,
-        "D": 0.01,
+        "D": 0.1,
         "conn_speed": 1.0,
         "dt": 0.5
     },
@@ -25,7 +25,7 @@
         "TAVG_period": 1.0,
         "num_stimulated_regions": 5,
         "global_conn_coupling_coef": 0.0126,
-        "D": 0.01,
+        "D": 0.1,
         "conn_speed": 1.0,
         "dt": 0.5
     },
@@ -35,7 +35,7 @@
         "TAVG_period": 1.0,
         "num_stimulated_regions": 5,
         "global_conn_coupling_coef": 0.0126,
-        "D": 0.01,
+        "D": 0.1,
         "conn_speed": 1.0,
         "dt": 0.5
     }
diff --git a/simul_dFC/run_scripts_slurm/tasks_info_pulseTrain.json b/simul_dFC/run_scripts_slurm/tasks_info_pulseTrain.json
index 41967c2..9a153ec 100644
--- a/simul_dFC/run_scripts_slurm/tasks_info_pulseTrain.json
+++ b/simul_dFC/run_scripts_slurm/tasks_info_pulseTrain.json
@@ -9,7 +9,7 @@
         "TAVG_period": 1.0,
         "num_stimulated_regions": 5,
         "global_conn_coupling_coef": 0.0126,
-        "D": 0.01,
+        "D": 0.1,
         "conn_speed": 1.0,
         "dt": 0.5
     },
@@ -23,7 +23,7 @@
         "TAVG_period": 1.0,
         "num_stimulated_regions": 5,
         "global_conn_coupling_coef": 0.0126,
-        "D": 0.01,
+        "D": 0.1,
         "conn_speed": 1.0,
         "dt": 0.5
     },
@@ -37,7 +37,7 @@
         "TAVG_period": 1.0,
         "num_stimulated_regions": 5,
         "global_conn_coupling_coef": 0.0126,
-        "D": 0.01,
+        "D": 0.1,
         "conn_speed": 1.0,
         "dt": 0.5
     },
@@ -51,7 +51,7 @@
         "TAVG_period": 1.0,
         "num_stimulated_regions": 5,
         "global_conn_coupling_coef": 0.0126,
-        "D": 0.01,
+        "D": 0.1,
         "conn_speed": 1.0,
         "dt": 0.5
     },
@@ -65,35 +65,7 @@
         "TAVG_period": 1.0,
         "num_stimulated_regions": 5,
         "global_conn_coupling_coef": 0.0126,
-        "D": 0.01,
-        "conn_speed": 1.0,
-        "dt": 0.5
-    },
-    "task-lowFreqShortRestDominStimul": {
-        "task_name": "task-lowFreqShortRestDominStimul",
-        "onset_time": 20.0,
-        "task_duration": 12.0,
-        "task_block_duration": 20.0,
-        "sim_length": 250e3,
-        "BOLD_period": 500,
-        "TAVG_period": 1.0,
-        "num_stimulated_regions": 26,
-        "global_conn_coupling_coef": 0.0126,
-        "D": 0.01,
-        "conn_speed": 1.0,
-        "dt": 0.5
-    },
-    "task-lowFreqShortRestNoisy": {
-        "task_name": "task-midFreqMidRestNoisy",
-        "onset_time": 20.0,
-        "task_duration": 12.0,
-        "task_block_duration": 20.0,
-        "sim_length": 250e3,
-        "BOLD_period": 500,
-        "TAVG_period": 1.0,
-        "num_stimulated_regions": 5,
-        "global_conn_coupling_coef": 0.0126,
-        "D": 1.00,
+        "D": 0.1,
         "conn_speed": 1.0,
         "dt": 0.5
     }

From 796a89594f3f7c222e82236a9684ae8cd1ab5114 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 10 Jul 2025 23:17:55 -0400
Subject: [PATCH 236/401] minor: print traceback when task simulation fails

---
 simul_dFC/task_data_simulator.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/simul_dFC/task_data_simulator.py b/simul_dFC/task_data_simulator.py
index 17059f9..df80ecc 100644
--- a/simul_dFC/task_data_simulator.py
+++ b/simul_dFC/task_data_simulator.py
@@ -7,6 +7,7 @@
 import argparse
 import json
 import os
+import traceback
 import warnings
 
 import numpy as np
@@ -66,6 +67,8 @@
         )
     except Exception as e:
         print(f"Error simulating task {task} for participant {participant_id}: {e}")
+        # print traceback
+        traceback.print_exc()
         continue
 
     # save the time series and task data

From ad4f2063322e875715ce5a5d76d0b9892dc7db09 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 10 Jul 2025 23:26:49 -0400
Subject: [PATCH 237/401] minor: check for "task_data" key in task_info

---
 pydfc/simul_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydfc/simul_utils.py b/pydfc/simul_utils.py
index 79f00f7..bf6d9a4 100644
--- a/pydfc/simul_utils.py
+++ b/pydfc/simul_utils.py
@@ -412,7 +412,7 @@ def simulate_task_data(subj_id, task_info):
             - dt: float
                 The simulation time step in milliseconds.
     """
-    if task_info["task_data"] is not None:
+    if "task_data" in task_info:
         # task_info["task_data"] is a path to a dictionary with {subj_id} as a placeholder
         if "{subj_id}" in task_info["task_data"]:
             task_data_path = task_info["task_data"].replace("{subj_id}", subj_id)

From d475670dc22580213f5fc8b4ea621f0687042d75 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 25 Jul 2025 13:38:19 -0400
Subject: [PATCH 238/401] minor in FCS

---
 task_dFC/FCS_estimate.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index f254134..15d9dff 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -2,6 +2,7 @@
 import json
 import os
 import time
+import traceback
 import warnings
 
 import numpy as np
@@ -201,17 +202,23 @@ def run_FCS_estimate(
     for session in SESSIONS:
         for task in TASKS:
             for run in RUNS[task]:
-                run_FCS_estimate(
-                    params_methods=params_methods,
-                    MEASURES_name_lst=picked_measure_list,
-                    alter_hparams=alter_hparams,
-                    params_multi_analysis=params_multi_analysis,
-                    task=task,
-                    roi_root=roi_root,
-                    output_root=fitted_measures_root,
-                    session=session,
-                    run=run,
-                )
+                try:
+                    run_FCS_estimate(
+                        params_methods=params_methods,
+                        MEASURES_name_lst=picked_measure_list,
+                        alter_hparams=alter_hparams,
+                        params_multi_analysis=params_multi_analysis,
+                        task=task,
+                        roi_root=roi_root,
+                        output_root=fitted_measures_root,
+                        session=session,
+                        run=run,
+                    )
+                except Exception as e:
+                    print(
+                        f"Error in run_FCS_estimate for task: {task}, session: {session}, run: {run}, measure: {picked_measure_list[0]}, error: {e}"
+                    )
+                    traceback.print_exc()
 
     print(
         f"FCS estimation CODE finished running ... for measure: {picked_measure_list[0]} ..."

From 30009c432d09581aba92a90e1a3b106dc08f2791 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 29 Jul 2025 13:34:03 -0400
Subject: [PATCH 239/401] update hyperparams

---
 task_dFC/run_scripts_slurm/methods_config.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/methods_config.json b/task_dFC/run_scripts_slurm/methods_config.json
index 8ff76f2..056646f 100644
--- a/task_dFC/run_scripts_slurm/methods_config.json
+++ b/task_dFC/run_scripts_slurm/methods_config.json
@@ -1,12 +1,12 @@
 {
     "params_methods" : {
-        "W": 12,
+        "W": 44,
         "n_overlap": 1.0,
         "sw_method": "pear_corr",
         "tapered_window": true,
         "TF_method": "WTC",
         "clstr_base_measure": "SlidingWindow",
-        "clstr_distance": "euclidean",
+        "clstr_distance": "manhattan",
         "hmm_iter": 20,
         "dhmm_obs_state_ratio": 0.666,
         "n_states": 5,

From 5dc21414a308c02d2a2dc1a0f1c3227bbfe3014e Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 29 Jul 2025 17:13:56 -0400
Subject: [PATCH 240/401] improve GMM and fix calc_task_duration

---
 pydfc/ml_utils.py   |  3 +-
 pydfc/task_utils.py | 70 +++++++++++++++++++++++++++++----------------
 2 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 2ef6c34..e58a06a 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1483,7 +1483,8 @@ def task_presence_classification(
                 n_neighbors_LE=125,
                 LE_embedding_method="embed+procrustes",
             )
-        except:
+        except Exception as e:
+            print(f"Error in embedding dFC features with {embedding}: {e}")
             continue
 
         # Silhouette score
diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 5342be2..e6a8bb3 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -297,10 +297,27 @@ def GMM_binarizing(
 ):
     event_labels_all_task_hrf = event_labels_all_task_hrf.copy()
     event_labels_all_task_hrf_reshaped = event_labels_all_task_hrf.reshape(-1, 1)
-    # Fit GMM
-    gmm = GaussianMixture(n_components=2, n_init=5).fit(
-        event_labels_all_task_hrf_reshaped
+    # normal the signal to [0, 1]
+    event_labels_all_task_hrf_reshaped = (
+        event_labels_all_task_hrf_reshaped - np.min(event_labels_all_task_hrf_reshaped)
+    ) / (
+        np.max(event_labels_all_task_hrf_reshaped)
+        - np.min(event_labels_all_task_hrf_reshaped)
     )
+    # Fit GMM
+    gmm = GaussianMixture(
+        n_components=2, means_init=np.array([[0.0], [1.0]]), n_init=5
+    ).fit(event_labels_all_task_hrf_reshaped)
+    means = gmm.means_.flatten()
+    # if the lower mean is larger than 0.25 or the higher mean is smaller than 0.75, we need to use 3 components
+    # first find the lower and higher mean
+    lower_mean = np.min(means)
+    higher_mean = np.max(means)
+    if lower_mean > 0.25 or higher_mean < 0.75:
+        # Fit GMM with 3 components
+        gmm = GaussianMixture(
+            n_components=3, means_init=np.array([[0.0], [0.5], [1.0]]), n_init=5
+        ).fit(event_labels_all_task_hrf_reshaped)
     # downsample to MRI TR
     if downsample:
         event_labels_all_task_hrf_reshaped = downsample_events_hrf(
@@ -317,30 +334,30 @@ def GMM_binarizing(
     # The "on" state should have a higher mean (HRF-convolved signal is elevated during task).
     means = gmm.means_.flatten()
     on_component = np.argmax(means)
-    # Get probability of being in the "on" state
+    off_component = np.argmin(means)
+    # if len(means) == 3:
+    #     # set the mid component to the one that is in between the on and off components
+    #     mid_component = np.argsort(means)[1]
+    # Get probability of being in the "on" and "off" state
     p_on = probs[:, on_component]
+    p_off = probs[:, off_component]
+    # if len(means) == 3:
+    #     p_mid = probs[:, mid_component]
     # Create a binarized signal with transition points discarded
-    indices = np.where((p_on <= threshold) | (p_on >= (1 - threshold)))[0]
-    task_presence = np.where(p_on >= (1 - threshold), 1, 0)
-
-    # check that both classes are non-empty
-    unique_labels = np.unique(task_presence[indices])
-    if len(unique_labels) < 2:
-        fallback_threshold = 0.10
-        indices = np.where(
-            (p_on <= fallback_threshold) | (p_on >= (1 - fallback_threshold))
-        )[0]
-        task_presence = np.where(p_on >= (1 - fallback_threshold), 1, 0)
-
-        # Re-check after fallback
+    for threshold in [0.01, 0.1, 0.2, 0.3, 0.4]:
+        # try different thresholds
+        # lower thresholds may result in only one class being present
+        indices = np.where((p_off >= (1 - threshold)) | (p_on >= (1 - threshold)))[0]
+        task_presence = np.where(p_on >= (1 - threshold), 1, 0)
+
+        # check that both classes are non-empty
         unique_labels = np.unique(task_presence[indices])
-        if len(unique_labels) < 2:
-            warnings.warn(
-                f"Even with fallback threshold={fallback_threshold}, only one class present in confident samples."
-            )
-        else:
+        if len(unique_labels) == 2:
+            break
+
+        if threshold == 0.4:
             warnings.warn(
-                f"Only one class detected at threshold={threshold}, falling back to threshold={fallback_threshold}."
+                f"Even with threshold={threshold}, only one class present in confident samples."
             )
 
     return task_presence, indices
@@ -456,10 +473,15 @@ def calc_task_duration(task_presence, TR_mri):
     return: avg_task_duration, var_task_duration
     """
     task_durations = list()
+    start = None
     for i in range(1, len(task_presence)):
         if task_presence[i] == 1 and task_presence[i - 1] == 0:
             start = i
-        if task_presence[i] == 0 and task_presence[i - 1] == 1:
+        if (
+            (task_presence[i] == 0)
+            and (task_presence[i - 1] == 1)
+            and (start is not None)
+        ):
             end = i
             task_durations.append((end - start) * TR_mri)
             start = None

From b40215e3cef037e87c73922257f5ad8455a843cf Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 29 Jul 2025 17:21:59 -0400
Subject: [PATCH 241/401] minor in GMM

---
 pydfc/task_utils.py | 46 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 38 insertions(+), 8 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index e6a8bb3..bfb39d7 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -289,12 +289,42 @@ def shifted_binarizing(
 
 def GMM_binarizing(
     event_labels_all_task_hrf,
-    threshold=0.01,
+    threshold=None,
     downsample=True,
     TR_mri=None,
     TR_task=None,
     TR_array=None,
 ):
+    """_summary_
+
+    Parameters
+    ----------
+    event_labels_all_task_hrf : _type_
+        _description_
+    threshold : float, optional
+        _description_, by default 0.01
+    downsample : bool, optional
+        _description_, by default True
+    TR_mri : _type_, optional
+        _description_, by default None
+    TR_task : _type_, optional
+        _description_, by default None
+    TR_array : _type_, optional
+        _description_, by default None
+
+    Returns
+    -------
+    task_presence : array
+        _description_
+    indices : array
+        _description_
+    -----------
+    in order to get the task presence, use task_presence[indices]
+    """
+    if threshold is None:
+        thresholds_list = [0.01, 0.1, 0.2, 0.3, 0.4]
+    else:
+        thresholds_list = [threshold]
     event_labels_all_task_hrf = event_labels_all_task_hrf.copy()
     event_labels_all_task_hrf_reshaped = event_labels_all_task_hrf.reshape(-1, 1)
     # normal the signal to [0, 1]
@@ -344,20 +374,20 @@ def GMM_binarizing(
     # if len(means) == 3:
     #     p_mid = probs[:, mid_component]
     # Create a binarized signal with transition points discarded
-    for threshold in [0.01, 0.1, 0.2, 0.3, 0.4]:
+    for threshold_ in thresholds_list:
         # try different thresholds
         # lower thresholds may result in only one class being present
-        indices = np.where((p_off >= (1 - threshold)) | (p_on >= (1 - threshold)))[0]
-        task_presence = np.where(p_on >= (1 - threshold), 1, 0)
+        indices = np.where((p_off >= (1 - threshold_)) | (p_on >= (1 - threshold_)))[0]
+        task_presence = np.where(p_on >= (1 - threshold_), 1, 0)
 
         # check that both classes are non-empty
         unique_labels = np.unique(task_presence[indices])
         if len(unique_labels) == 2:
             break
 
-        if threshold == 0.4:
+        if threshold_ == 0.4:
             warnings.warn(
-                f"Even with threshold={threshold}, only one class present in confident samples."
+                f"Even with threshold={threshold_}, only one class present in confident samples."
             )
 
     return task_presence, indices
@@ -434,8 +464,8 @@ def extract_task_presence(
             indices = np.arange(task_presence.shape[0])
         elif binarizing_method == "GMM":
             task_presence, indices = GMM_binarizing(
-                event_labels_all_task_hrf,
-                threshold=0.01,
+                event_labels_all_task_hrf=event_labels_all_task_hrf,
+                threshold=None,
                 downsample=True,
                 TR_mri=TR_mri,
                 TR_task=TR_task,

From 48ce13b9901a5d4c22df256c0970d3b174989758 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 30 Jul 2025 13:46:02 -0400
Subject: [PATCH 242/401] minor: use n_jobs=8 and request 8 CPUs per task for ML

---
 task_dFC/ML.py                       | 2 +-
 task_dFC/run_scripts_slurm/run_ML.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index ea857f2..bb5c6e2 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -378,7 +378,7 @@ def run_clustering_for_visual(
             output_root=ML_root,
             dynamic_pred="no",
             normalize_dFC=True,
-            n_jobs=6,
+            n_jobs=8,
         )
     except Exception as e:
         print(f"Error in classification for dFC ID {dFC_id}: {e}")
diff --git a/task_dFC/run_scripts_slurm/run_ML.sh b/task_dFC/run_scripts_slurm/run_ML.sh
index 7187c62..fd0632b 100644
--- a/task_dFC/run_scripts_slurm/run_ML.sh
+++ b/task_dFC/run_scripts_slurm/run_ML.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 #
-#SBATCH --job-name=ML_job   # Optional: Name of your job
+#SBATCH --cpus-per-task=8  # Number of CPU cores per task
 #SBATCH --output=logs/ML_out.txt  # Standard output log
 #SBATCH --error=logs/ML_err.txt   # Standard error log
 #SBATCH --mem=128G                     # Memory request per node

From ada54e4407aa7a938d7830db00370801bade5cca Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 30 Jul 2025 16:16:19 -0400
Subject: [PATCH 243/401] fix SI_ID

---
 pydfc/ml_utils.py | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index e58a06a..d99bd5d 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -612,7 +612,13 @@ def twonn(X, discard_ratio=0.1):
     return d
 
 
-def SI_ID(X, y, search_range=range(2, 50, 5), n_neighbors_LE=125):
+def SI_ID(
+    X,
+    y,
+    search_range=range(2, 50, 5),
+    n_neighbors_LE=125,
+    LE_embedding_method="embed+procrustes",
+):
     """
     Find the intrinsic dimension of the data based on the silhouette score.
     """
@@ -635,9 +641,12 @@ def SI_ID(X, y, search_range=range(2, 50, 5), n_neighbors_LE=125):
                 embedding="LE",
                 n_components=n_components,
                 n_neighbors_LE=n_neighbors_LE,
-                LE_embedding_method="embed+procrustes",
+                LE_embedding_method=LE_embedding_method,
+            )
+        except Exception as e:
+            warnings.warn(
+                f"Error in SI_ID embedding with n_components={n_components}: {e}. Skipping this n_components."
             )
-        except:
             continue
 
         SI_score[n_components] = silhouette_score(X_train_embed, y)
@@ -656,6 +665,7 @@ def find_intrinsic_dim(
     method="SI",
     n_neighbors_LE=125,
     search_range_SI=range(2, 50, 5),
+    LE_embedding_method="embed+procrustes",
 ):
     """
     Find the number of components to use for embedding the data using LE.
@@ -677,6 +687,7 @@ def find_intrinsic_dim(
                     y_subj,
                     search_range=search_range_SI,
                     n_neighbors_LE=n_neighbors_LE,
+                    LE_embedding_method=LE_embedding_method,
                 )
             )
         intrinsic_dim = int(np.mean(intrinsic_dim_all))
@@ -958,6 +969,12 @@ def embed_dFC_features(
                 LE_embedding_method = "concat+embed"
         # if n_components is not specified, find the intrinsic dimension of the data using training set and based on the silhouette score
         if n_components == "auto":
+            if X_train.shape[1] < 7:
+                search_range_SI = range(2, X_train.shape[1] + 1)
+            elif X_train.shape[1] < 24:
+                search_range_SI = range(2, X_train.shape[1] + 1, 2)
+            else:
+                search_range_SI = range(2, 50, 5)
             n_components = find_intrinsic_dim(
                 X=X_train,
                 y=y_train,
@@ -965,7 +982,8 @@ def embed_dFC_features(
                 subjects=train_subjects,
                 method="SI",
                 n_neighbors_LE=n_neighbors_LE,
-                search_range_SI=range(2, 50, 5),
+                search_range_SI=search_range_SI,
+                LE_embedding_method=LE_embedding_method,
             )
 
         if LE_embedding_method == "embed+procrustes":

From 4297309c6c69f04c28c88ba46123b784058bf8b7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 30 Jul 2025 17:53:42 -0400
Subject: [PATCH 244/401] fix SI_ID

---
 pydfc/ml_utils.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index d99bd5d..d46e97a 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -681,15 +681,21 @@ def find_intrinsic_dim(
         for subject in subjects:
             X_subj = X[subj_label == subject, :]
             y_subj = y[subj_label == subject]
-            intrinsic_dim_all.append(
-                SI_ID(
+            try:
+                # some subjects may not have enough samples to estimate the intrinsic dimension
+                subj_estim_ID = SI_ID(
                     X_subj,
                     y_subj,
                     search_range=search_range_SI,
                     n_neighbors_LE=n_neighbors_LE,
                     LE_embedding_method=LE_embedding_method,
                 )
-            )
+                intrinsic_dim_all.append(subj_estim_ID)
+            except Exception as e:
+                warnings.warn(
+                    f"Error in SI_ID for subject {subject}: {e}. Skipping this subject."
+                )
+                continue
         intrinsic_dim = int(np.mean(intrinsic_dim_all))
     elif method == "twonn":
         intrinsic_dim_all = list()
@@ -709,11 +715,17 @@ def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"):
     min_n_neighbors = 70
 
     if n_neighbors >= X.shape[0]:
-        n_neighbors_to_be_used = min_n_neighbors
-        # raise a warning
-        warnings.warn(
-            "n_neighbors is larger than the number of samples. n_neighbors is set to the minimum value of 70."
-        )
+        if min_n_neighbors >= X.shape[0]:
+            n_neighbors_to_be_used = int(X.shape[0] * 2 / 3)
+            warnings.warn(
+                f"number of samples is less than {min_n_neighbors}. n_neighbors is set to {n_neighbors_to_be_used}."
+            )
+        else:
+            n_neighbors_to_be_used = min_n_neighbors
+            # raise a warning
+            warnings.warn(
+                f"n_neighbors is larger than the number of samples. n_neighbors is set to the minimum value of {min_n_neighbors}."
+            )
     else:
         n_neighbors_to_be_used = n_neighbors
 

From c6cb9e6c1e69357f91c285fbc2d146e5646ca733 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 30 Jul 2025 18:24:24 -0400
Subject: [PATCH 245/401] include rs in nifti2roi

---
 task_dFC/nifti_to_roi_signal.py | 91 +++++++++++++++++++--------------
 1 file changed, 54 insertions(+), 37 deletions(-)

diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index ec14515..e91eed1 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -38,7 +38,7 @@ def run_roi_signal_extraction(
         else:
             ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/{session}/func/")
     except FileNotFoundError:
-        print(f"Subject {subj} {session_str} not found in {fmriprep_root}")
+        warnings.warn(f"Subject {subj} {session_str} not found in {fmriprep_root}")
         return
 
     ALL_TASK_FILES = [
@@ -49,7 +49,7 @@ def run_roi_signal_extraction(
 
     if not len(ALL_TASK_FILES) >= 1:
         # if the func file is not found, exclude the subject
-        print(f"Func file not found for {subj} {session_str} {task}")
+        warnings.warn(f"Func file not found for {subj} {session_str} {task}")
         return
 
     for run in RUNS:
@@ -102,9 +102,11 @@ def run_roi_signal_extraction(
         if global_acquisition_data is None and acquisition_data is None:
             # if the acquisition_data is not found, exclude the subject
             if run is None:
-                print(f"bold.json info file not found for {subj} {session_str} {task}")
+                warnings.warn(
+                    f"bold.json info file not found for {subj} {session_str} {task}"
+                )
             else:
-                print(
+                warnings.warn(
                     f"bold.json info file not found for {subj} {session_str} {task} {run}"
                 )
             return
@@ -121,9 +123,13 @@ def run_roi_signal_extraction(
         # if not found, print a warning and skip the subject
         if TR_mri is None:
             if run is None:
-                print(f"Repetition time not found for {subj} {session_str} {task}")
+                warnings.warn(
+                    f"Repetition time not found for {subj} {session_str} {task}"
+                )
             else:
-                print(f"Repetition time not found for {subj} {session_str} {task} {run}")
+                warnings.warn(
+                    f"Repetition time not found for {subj} {session_str} {task} {run}"
+                )
             return
         ################################# EXTRACT TIME SERIES #########################
         # extract ROI signals and convert to TIME_SERIES object
@@ -158,6 +164,7 @@ def run_roi_signal_extraction(
                 file_i for file_i in ALL_EVENTS_FILES if f"_{session}_" in file_i
             ]
 
+        events_file_exists = True
         if not len(ALL_EVENTS_FILES) == 1:
             # in some cases the event file is common for all subjects and can be found in f"{bids_root}"
             ALL_EVENTS_FILES_COMMON = os.listdir(f"{bids_root}/")
@@ -169,43 +176,52 @@ def run_roi_signal_extraction(
             if len(ALL_EVENTS_FILES_COMMON) == 1:
                 events_file = f"{bids_root}/{ALL_EVENTS_FILES_COMMON[0]}"
             else:
-                # if the events file is not found, exclude the subject
+                # if the events file is not found, do not exclude the subject, only save time-series data
+                # this will allow including resting state files
                 if run is None:
-                    print(f"Events file not found for {subj} {session_str} {task}")
+                    warnings.warn(
+                        f"Events file not found for {subj} {session_str} {task}"
+                    )
                 else:
-                    print(f"Events file not found for {subj} {session_str} {task} {run}")
-                return
+                    warnings.warn(
+                        f"Events file not found for {subj} {session_str} {task} {run}"
+                    )
+                events_file_exists = False
         else:
             events_file = f"{task_events_root}/{ALL_EVENTS_FILES[0]}"
 
-        # load the tsv events file
-        events = np.genfromtxt(events_file, delimiter="\t", dtype=str)
-        # get the event labels
-        event_labels, Fs_task, event_types = task_utils.events_time_to_labels(
-            events=events,
-            TR_mri=TR_mri,
-            num_time_mri=num_time_mri,
-            event_types=None,
-            oversampling=oversampling,
-            trial_type_label=trial_type_label,
-            rest_labels=rest_labels,
-            return_0_1=False,
-        )
-        # fill task labels with task's index
-        task_labels = np.ones((int(num_time_mri * oversampling), 1)) * TASKS.index(task)
+        if events_file_exists:
+            # load the tsv events file
+            events = np.genfromtxt(events_file, delimiter="\t", dtype=str)
+            # get the event labels
+            event_labels, Fs_task, event_types = task_utils.events_time_to_labels(
+                events=events,
+                TR_mri=TR_mri,
+                num_time_mri=num_time_mri,
+                event_types=None,
+                oversampling=oversampling,
+                trial_type_label=trial_type_label,
+                rest_labels=rest_labels,
+                return_0_1=False,
+            )
+            # fill task labels with task's index
+            task_labels = np.ones((int(num_time_mri * oversampling), 1)) * TASKS.index(
+                task
+            )
         ################################# SAVE #################################
         # save the ROI time series and task data
-        task_data = {
-            "task": task,
-            "task_labels": task_labels,
-            "task_types": TASKS,
-            "event_labels": event_labels,
-            "event_types": event_types,
-            "events": events,
-            "Fs_task": Fs_task,
-            "TR_mri": TR_mri,
-            "num_time_mri": num_time_mri,
-        }
+        if events_file_exists:
+            task_data = {
+                "task": task,
+                "task_labels": task_labels,
+                "task_types": TASKS,
+                "event_labels": event_labels,
+                "event_types": event_types,
+                "events": events,
+                "Fs_task": Fs_task,
+                "TR_mri": TR_mri,
+                "num_time_mri": num_time_mri,
+            }
 
         if session is None:
             subj_session_prefix = f"{subj}"
@@ -222,7 +238,8 @@ def run_roi_signal_extraction(
         if not os.path.exists(f"{output_dir}/"):
             os.makedirs(f"{output_dir}/")
         np.save(f"{output_dir}/{output_file_prefix}_time-series.npy", time_series)
-        np.save(f"{output_dir}/{output_file_prefix}_task-data.npy", task_data)
+        if events_file_exists:
+            np.save(f"{output_dir}/{output_file_prefix}_task-data.npy", task_data)
 
 
 ########################################################################################

From c1b2b5d571b8127c1e848f9c76f85c72393fdd1d Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 7 Aug 2025 14:29:38 -0400
Subject: [PATCH 246/401] remove run_clustering_for_visual and
 run_task_features_extraction from ML

---
 task_dFC/ML.py | 60 +++++++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index bb5c6e2..e015bcd 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -345,20 +345,20 @@ def run_clustering_for_visual(
     else:
         ML_root = dataset_info["ML_root"]
 
-    # The task feature extraction will be executed multiple times in parallel redundantly
-    try:
-        run_task_features_extraction(
-            TASKS=TASKS,
-            RUNS=RUNS,
-            SESSIONS=SESSIONS,
-            roi_root=roi_root,
-            dFC_root=dFC_root,
-            output_root=ML_root,
-        )
-    except Exception as e:
-        print(f"Error in task features extraction: {e}")
-        traceback.print_exc()
-    print("Task features extraction finished.")
+    # # The task feature extraction will be executed multiple times in parallel redundantly
+    # try:
+    #     run_task_features_extraction(
+    #         TASKS=TASKS,
+    #         RUNS=RUNS,
+    #         SESSIONS=SESSIONS,
+    #         roi_root=roi_root,
+    #         dFC_root=dFC_root,
+    #         output_root=ML_root,
+    #     )
+    # except Exception as e:
+    #     print(f"Error in task features extraction: {e}")
+    #     traceback.print_exc()
+    # print("Task features extraction finished.")
 
     job_id = os.getenv("SGE_TASK_ID")  # for SGE
     if job_id is None:
@@ -402,23 +402,23 @@ def run_clustering_for_visual(
 
     # print(f"Task presence clustering finished for dFC ID {dFC_id}.")
 
-    print(f"Clustering for visualization started for dFC ID {dFC_id} ...")
-    try:
-        run_clustering_for_visual(
-            dFC_id=dFC_id,
-            TASKS=TASKS,
-            RUNS=RUNS,
-            SESSIONS=SESSIONS,
-            roi_root=roi_root,
-            dFC_root=dFC_root,
-            output_root=ML_root,
-            normalize_dFC=True,
-        )
-    except Exception as e:
-        print(f"Error in clustering for visualization for dFC ID {dFC_id}: {e}")
-        traceback.print_exc()
+    # print(f"Clustering for visualization started for dFC ID {dFC_id} ...")
+    # try:
+    #     run_clustering_for_visual(
+    #         dFC_id=dFC_id,
+    #         TASKS=TASKS,
+    #         RUNS=RUNS,
+    #         SESSIONS=SESSIONS,
+    #         roi_root=roi_root,
+    #         dFC_root=dFC_root,
+    #         output_root=ML_root,
+    #         normalize_dFC=True,
+    #     )
+    # except Exception as e:
+    #     print(f"Error in clustering for visualization for dFC ID {dFC_id}: {e}")
+    #     traceback.print_exc()
 
-    print(f"Clustering for visualization finished for dFC ID {dFC_id}.")
+    # print(f"Clustering for visualization finished for dFC ID {dFC_id}.")
 
     print(f"Task presence prediction finished for dFC ID {dFC_id}.")
 

From 4d4c4d8697cb451af895b791158b68c2c74b48f6 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 21 Aug 2025 19:25:54 -0400
Subject: [PATCH 247/401] optimize ML code

---
 pydfc/ml_utils.py | 85 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 68 insertions(+), 17 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index d46e97a..fdff07c 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -305,6 +305,8 @@ def dFC_feature_extraction_subj_lvl(
         features = features[2:-2, :]
         target = target[2:-2]
 
+    features = features.astype(np.float32, copy=False)
+    target = target.astype(np.int8, copy=False)  # labels smaller & faster
     return features, target
 
 
@@ -355,6 +357,10 @@ def dFC_feature_extraction(
             FCS_proba_for_SB=FCS_proba_for_SB,
         )
 
+        # to make computations faster
+        X_subj = X_subj.astype(np.float32, copy=False)
+        y_subj = y_subj.astype(np.int8, copy=False)
+
         subj_label_train.extend([subj for i in range(X_subj.shape[0])])
         if X_train is None and y_train is None:
             X_train = X_subj
@@ -395,10 +401,14 @@ def dFC_feature_extraction(
             FCS_proba_for_SB=FCS_proba_for_SB,
         )
 
+        # to make computations faster
+        X_subj = X_subj.astype(np.float32, copy=False)
+        y_subj = y_subj.astype(np.int8, copy=False)
+
         subj_label_test.extend([subj for i in range(X_subj.shape[0])])
         if X_test is None and y_test is None:
-            X_test = X_subj
-            y_test = y_subj
+            X_test = X_subj.astype(np.float32, copy=False)
+            y_test = y_subj.astype(np.int8, copy=False)
         else:
             X_test = np.concatenate((X_test, X_subj), axis=0)
             y_test = np.concatenate((y_test, y_subj), axis=0)
@@ -459,7 +469,7 @@ def precheck_for_procruste(X_best, X_subj):
     return X_best_new
 
 
-def generalized_procrustes(X_embed_dict):
+def generalized_procrustes(X_embed_dict, max_iter=1000, tol=1e-6):
     """
     Generalized Procrustes Analysis
 
@@ -499,7 +509,7 @@ def generalized_procrustes(X_embed_dict):
         X_list.append(X_scan_embed_new)
 
     # now iteratively find the mean X for transform
-    for iter_num in range(100):
+    for _ in range(10):
 
         try:
             # initialize Procrustes distance
@@ -518,7 +528,7 @@ def generalized_procrustes(X_embed_dict):
             flag = False
             while True:
                 counter += 1
-                if counter > 1e6:
+                if counter > max_iter:
                     # if the algorithm does not converge, break the cycle
                     # to avoid infinite loop
                     flag = True
@@ -538,7 +548,7 @@ def generalized_procrustes(X_embed_dict):
                 _, _, new_distance = procrustes(new_mean, mean_X)
 
                 # if the distance did not change, break the cycle
-                if np.abs(new_distance - current_distance) < 1e-6:
+                if np.abs(new_distance - current_distance) < tol:
                     break
 
                 # align the new_mean to old mean
@@ -549,6 +559,7 @@ def generalized_procrustes(X_embed_dict):
                 current_distance = new_distance
 
             if not flag:
+                # if the algorithm converged, return the mean X
                 return mean_X
         except:
             continue
@@ -618,6 +629,7 @@ def SI_ID(
     search_range=range(2, 50, 5),
     n_neighbors_LE=125,
     LE_embedding_method="embed+procrustes",
+    measure_is_state_based=False,
 ):
     """
     Find the intrinsic dimension of the data based on the silhouette score.
@@ -642,6 +654,7 @@ def SI_ID(
                 n_components=n_components,
                 n_neighbors_LE=n_neighbors_LE,
                 LE_embedding_method=LE_embedding_method,
+                measure_is_state_based=measure_is_state_based,
             )
         except Exception as e:
             warnings.warn(
@@ -666,6 +679,7 @@ def find_intrinsic_dim(
     n_neighbors_LE=125,
     search_range_SI=range(2, 50, 5),
     LE_embedding_method="embed+procrustes",
+    measure_is_state_based=False,
 ):
     """
     Find the number of components to use for embedding the data using LE.
@@ -689,6 +703,7 @@ def find_intrinsic_dim(
                     search_range=search_range_SI,
                     n_neighbors_LE=n_neighbors_LE,
                     LE_embedding_method=LE_embedding_method,
+                    measure_is_state_based=measure_is_state_based,
                 )
                 intrinsic_dim_all.append(subj_estim_ID)
             except Exception as e:
@@ -736,8 +751,10 @@ def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"):
         include_self=False,
         metric=distance_metric,
     )
-    affinity_matrix = affinity_matrix.toarray()
-    affinity_matrix = np.divide(affinity_matrix + affinity_matrix.T, 2)
+
+    # Symmetrize
+    affinity_matrix = affinity_matrix.maximum(affinity_matrix.T)
+
     LE = SpectralEmbedding(
         n_components=n_components,
         affinity="precomputed",
@@ -931,6 +948,19 @@ def LE_embed_procustes(
     return X_train_embed, X_test_embed
 
 
+def rows_look_redundant(X, sample=100):
+    """Heuristic: True if >50% of up to `sample` randomly drawn rows of X are duplicates."""
+    n = X.shape[0]
+    if n == 0:
+        return False  # no rows to compare (also avoids 0/0 below)
+    # Subsample without replacement so the check stays cheap on large inputs
+    Xs = X[np.random.choice(n, sample, replace=False)] if n > sample else X
+    # Compare rows by their raw bytes: exact equality, no hash collisions
+    n_distinct = len({row.tobytes() for row in Xs})
+    # If more than, say, 50% duplicates -> likely state-based
+    return (len(Xs) - n_distinct) / len(Xs) > 0.5
+
+
 def embed_dFC_features(
     train_subjects,
     test_subjects,
@@ -944,6 +974,7 @@ def embed_dFC_features(
     n_components="auto",
     n_neighbors_LE=125,
     LE_embedding_method="embed+procrustes",
+    measure_is_state_based=False,
 ):
     """
     Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous.
@@ -973,7 +1004,7 @@ def embed_dFC_features(
             X_test_embed = None
     elif embedding == "LE":
         # if the dFC features are not unique (state-based), set the LE_embedding_method to "concat+embed"
-        if np.unique(X_train, axis=0).shape[0] < X_train.shape[0] // 2:
+        if measure_is_state_based:
             if LE_embedding_method == "embed+procrustes":
                 warnings.warn(
                     "The dFC features are not unique (state-based). Switching to 'concat+embed' method."
@@ -996,6 +1027,7 @@ def embed_dFC_features(
                 n_neighbors_LE=n_neighbors_LE,
                 search_range_SI=search_range_SI,
                 LE_embedding_method=LE_embedding_method,
+                measure_is_state_based=measure_is_state_based,
             )
 
         if LE_embedding_method == "embed+procrustes":
@@ -1033,7 +1065,10 @@ def embed_dFC_features(
             else:
                 X_test_embed = None
 
-    return X_train_embed, X_test_embed
+    # to make computation faster, we can return the embeddings as float32
+    return X_train_embed.astype(np.float32, copy=False), X_test_embed.astype(
+        np.float32, copy=False
+    )
 
 
 ################################# Classification Framework Functions ####################################
@@ -1085,21 +1120,22 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test, subj_label_tr
     """
     # create a pipeline with a logistic regression model to find the best C
     logistic_reg = make_pipeline(
-        StandardScaler(), LogisticRegression(penalty="l1", solver="saga")
+        StandardScaler(),
+        LogisticRegression(penalty="l1", solver="saga", max_iter=2000, tol=1e-3),
     )
     # create a dictionary of all values we want to test for C
-    param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
+    param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100]}
 
     # use StratifiedGroupKFold to ensure that the same subject is not in both train and test sets
     # shuffle the data to ensure time points are shuffled
     if subj_label_train is None:
         X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train)
-        cv = StratifiedKFold(n_splits=5)
+        cv = StratifiedKFold(n_splits=3)
     else:
         X_train_shuffled, y_train_shuffled, subj_label_train_shuffled = shuffle(
             X_train, y_train, subj_label_train
         )
-        cv = StratifiedGroupKFold(n_splits=5)
+        cv = StratifiedGroupKFold(n_splits=3)
     # use gridsearch to test all values for C
     lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=cv, n_jobs=-1)
     # fit model to data
@@ -1113,7 +1149,7 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test, subj_label_tr
 
     model = make_pipeline(
         StandardScaler(),
-        LogisticRegression(penalty="l1", C=C, solver="saga"),
+        LogisticRegression(penalty="l1", C=C, solver="saga", max_iter=2000, tol=1e-3),
     )
 
     RESULT = get_classification_results(
@@ -1136,8 +1172,8 @@ def SVM_classify(X_train, y_train, X_test, y_test, subj_label_train=None):
     """
     # define the parameter grid
     param_grid = {
-        "svc__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
-        "svc__gamma": [0.0001, 0.001, 0.01, 1, 10, 100, 1000],
+        "svc__C": [0.01, 0.1, 1, 10],
+        "svc__gamma": ["scale", 0.01, 0.05, 0.1],
     }
 
     # perform grid search
@@ -1476,6 +1512,20 @@ def task_presence_classification(
             FCS_proba_for_SB=True,  # for state-based dFC features, we use FCS_proba
         )
     )
+    measure_is_state_based = None
+    if measure_name in ["SlidingWindow", "Time-Freq"]:
+        measure_is_state_based = False
+    elif measure_name in [
+        "CAP",
+        "Clustering",
+        "ContinuousHMM",
+        "DiscreteHMM",
+        "Windowless",
+    ]:
+        measure_is_state_based = True
+    else:
+        # raise error
+        raise ValueError(f"Unknown measure name: {measure_name}")
 
     ML_scores = {
         "group_lvl": {
@@ -1512,6 +1562,7 @@ def task_presence_classification(
                 n_components="auto",
                 n_neighbors_LE=125,
                 LE_embedding_method="embed+procrustes",
+                measure_is_state_based=measure_is_state_based,
             )
         except Exception as e:
             print(f"Error in embedding dFC features with {embedding}: {e}")

From 08e07d64c2bd7b42f5d554d06ee083374e6d6a08 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 22 Aug 2025 20:38:44 -0400
Subject: [PATCH 248/401] remove permut tests from ML

---
 pydfc/ml_utils.py | 82 +++++++++++++++++++++++------------------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index fdff07c..c9695e5 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1604,42 +1604,42 @@ def task_presence_classification(
 
         ML_models = {"Logistic regression": log_reg_RESULT, "SVM": SVM_RESULT}
 
-        # permutation tests
-        permutation_scores = {
-            "train": {},
-            "test": {},
-        }
-        for model_name in ML_models:
-            (
-                permutation_train_scores,
-                permutation_test_scores,
-                p_value_train,
-                p_value_test,
-            ) = get_permutation_scores(
-                X_train=X_train_embedded,
-                y_train=y_train,
-                X_test=X_test_embedded,
-                y_test=y_test,
-                classifier_model=ML_models[model_name]["model"],
-                groups_train=subj_label_train,
-                n_permutations=100,
-            )
-            permutation_scores["train"][
-                f"{model_name} permutation p_value"
-            ] = p_value_train
-            permutation_scores["train"][f"{model_name} permutation score mean"] = np.mean(
-                permutation_train_scores
-            )
-            permutation_scores["train"][f"{model_name} permutation score std"] = np.std(
-                permutation_train_scores
-            )
-            permutation_scores["test"][f"{model_name} permutation p_value"] = p_value_test
-            permutation_scores["test"][f"{model_name} permutation score mean"] = np.mean(
-                permutation_test_scores
-            )
-            permutation_scores["test"][f"{model_name} permutation score std"] = np.std(
-                permutation_test_scores
-            )
+        # # permutation tests
+        # permutation_scores = {
+        #     "train": {},
+        #     "test": {},
+        # }
+        # for model_name in ML_models:
+        #     (
+        #         permutation_train_scores,
+        #         permutation_test_scores,
+        #         p_value_train,
+        #         p_value_test,
+        #     ) = get_permutation_scores(
+        #         X_train=X_train_embedded,
+        #         y_train=y_train,
+        #         X_test=X_test_embedded,
+        #         y_test=y_test,
+        #         classifier_model=ML_models[model_name]["model"],
+        #         groups_train=subj_label_train,
+        #         n_permutations=100,
+        #     )
+        #     permutation_scores["train"][
+        #         f"{model_name} permutation p_value"
+        #     ] = p_value_train
+        #     permutation_scores["train"][f"{model_name} permutation score mean"] = np.mean(
+        #         permutation_train_scores
+        #     )
+        #     permutation_scores["train"][f"{model_name} permutation score std"] = np.std(
+        #         permutation_train_scores
+        #     )
+        #     permutation_scores["test"][f"{model_name} permutation p_value"] = p_value_test
+        #     permutation_scores["test"][f"{model_name} permutation score mean"] = np.mean(
+        #         permutation_test_scores
+        #     )
+        #     permutation_scores["test"][f"{model_name} permutation score std"] = np.std(
+        #         permutation_test_scores
+        #     )
 
         # group level scores
         for group in ["train", "test"]:
@@ -1661,11 +1661,11 @@ def task_presence_classification(
                         ML_models[model_name][group][metric]
                     )
 
-            # permutation test results
-            for key in permutation_scores[group]:
-                if not key in ML_scores["group_lvl"]:
-                    ML_scores["group_lvl"][key] = list()
-                ML_scores["group_lvl"][key].append(permutation_scores[group][key])
+            # # permutation test results
+            # for key in permutation_scores[group]:
+            #     if not key in ML_scores["group_lvl"]:
+            #         ML_scores["group_lvl"][key] = list()
+            #     ML_scores["group_lvl"][key].append(permutation_scores[group][key])
 
         # subject level scores
         for subj in SUBJECTS:

From e8bae527d4f1f9c6b39730a37bcdde23028ef445 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 23 Aug 2025 10:45:09 -0400
Subject: [PATCH 249/401] fix bug: skip float32 cast when X_test_embed is None

---
 pydfc/ml_utils.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index c9695e5..7f10d78 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1066,9 +1066,10 @@ def embed_dFC_features(
                 X_test_embed = None
 
     # to make computation faster, we can return the embeddings as float32
-    return X_train_embed.astype(np.float32, copy=False), X_test_embed.astype(
-        np.float32, copy=False
-    )
+    X_train_embed = X_train_embed.astype(np.float32, copy=False)
+    if X_test_embed is not None:
+        X_test_embed = X_test_embed.astype(np.float32, copy=False)
+    return X_train_embed, X_test_embed
 
 
 ################################# Classification Framework Functions ####################################

From 19cab30faa77c986731f6b87891368efb5e6298c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 24 Aug 2025 14:11:56 -0400
Subject: [PATCH 250/401] fix error when num classes = 1

---
 pydfc/ml_utils.py | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 7f10d78..af47ae0 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1547,6 +1547,9 @@ def task_presence_classification(
             "embedding": list(),
         },
     }
+
+    check_count = 2
+    num_excluded_subjects = 0
     for embedding in ["PCA", "LE"]:
         # embed dFC features
         try:
@@ -1569,6 +1572,14 @@ def task_presence_classification(
             print(f"Error in embedding dFC features with {embedding}: {e}")
             continue
 
+        # check if both classes are present in train and test sets
+        if len(np.unique(y_train)) < 2 or len(np.unique(y_test)) < 2:
+            print(
+                f"Only one class present in train or test sets for {embedding}. Skipping..."
+            )
+            check_count -= 1
+            continue
+
         # Silhouette score
         # SI does not need to be separated for train and test sets
         # we will use the same SI for both train and test sets
@@ -1670,15 +1681,20 @@ def task_presence_classification(
 
         # subject level scores
         for subj in SUBJECTS:
-            ML_scores["subj_lvl"]["subj_id"].append(subj)
             if subj in train_subjects:
-                ML_scores["subj_lvl"]["group"].append("train")
+                subj_group = "train"
                 features = X_train_embedded[subj_label_train == subj, :]
                 target = y_train[subj_label_train == subj]
             elif subj in test_subjects:
-                ML_scores["subj_lvl"]["group"].append("test")
+                subj_group = "test"
                 features = X_test_embedded[subj_label_test == subj, :]
                 target = y_test[subj_label_test == subj]
+            # check if only one class is present, skip the subject
+            if len(np.unique(target)) < 2:
+                num_excluded_subjects += 1
+                continue
+            ML_scores["subj_lvl"]["group"].append(subj_group)
+            ML_scores["subj_lvl"]["subj_id"].append(subj)
 
             # Silhouette score
             ML_scores["subj_lvl"]["SI"].append(silhouette_score(features, target))
@@ -1709,7 +1725,9 @@ def task_presence_classification(
             ), f"Length of {key} is not equal to others."
 
     # L is supposed to be equal to 2 embeddings (PCA and LE) * 2 groups (train and test)
-    assert L == 2 * 2, f"Length of group_lvl is not equal to 4, but {L}."
+    assert (
+        L == check_count * 2
+    ), f"Length of group_lvl is not equal to {check_count * 2}, but {L}."
 
     L = None
     for key in ML_scores["subj_lvl"]:
@@ -1722,8 +1740,8 @@ def task_presence_classification(
 
     # L is supposed to be equal to number of subjects * 2 embeddings (PCA and LE)
     assert (
-        L == len(SUBJECTS) * 2
-    ), f"Length of subj_lvl is not equal to {len(SUBJECTS) * 2}, but {L}."
+        L == len(SUBJECTS) * check_count - num_excluded_subjects
+    ), f"Length of subj_lvl is not equal to {len(SUBJECTS) * check_count - num_excluded_subjects}, but {L}."
 
     return ML_scores
 

From 2f2c3f148b6abf28514df16144c8a830e91e23ce Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 29 Aug 2025 17:30:51 -0400
Subject: [PATCH 251/401] decrement check_count when embedding fails

---
 pydfc/ml_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index af47ae0..d9fc751 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1570,6 +1570,7 @@ def task_presence_classification(
             )
         except Exception as e:
             print(f"Error in embedding dFC features with {embedding}: {e}")
+            check_count -= 1
             continue
 
         # check if both classes are present in train and test sets

From 0fa5133ae782fe50f410699f5f317702e8e58c75 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 30 Aug 2025 22:25:49 -0400
Subject: [PATCH 252/401] add process_SB_features with ILR

---
 pydfc/ml_utils.py | 109 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 88 insertions(+), 21 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index d9fc751..653b259 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1430,6 +1430,69 @@ def get_permutation_scores(
     return permutation_train_scores, permutation_test_scores, p_value_train, p_value_test
 
 
+def softmax(x, tau=1.0, axis=1):  # temperature-scaled softmax along `axis`
+    z = (x - np.max(x, axis=axis, keepdims=True)) / float(tau)  # shift by the max, divide by tau
+    np.exp(z, out=z)  # exponentiate in place
+    z_sum = np.sum(z, axis=axis, keepdims=True)
+    z /= z_sum  # entries along `axis` now sum to 1
+    return z
+
+
+def clip_and_renorm(P, eps=1e-6, axis=1):  # floor entries at eps, then renormalize along `axis`
+    P = np.asarray(P, float)
+    P = np.clip(P, eps, None)  # lower bound only (no upper limit)
+    P /= P.sum(axis=axis, keepdims=True)  # rescale so entries along `axis` sum to 1
+    return P
+
+
+# ---- log-ratio transforms ----
+def clr_transform(P, eps=1e-6):
+    """Centered log-ratio: log(p) - mean(log(p)) row-wise, after flooring p at eps and renormalizing."""
+    P = clip_and_renorm(P, eps=eps)  # keeps entries strictly positive (for eps > 0) so log is defined
+    L = np.log(P)
+    return L - L.mean(axis=1, keepdims=True)  # each row sums to 0
+
+
+def ilr_transform(P, eps=1e-6):
+    """Pivot ILR using an orthonormal basis; returns (n, K-1)."""
+    P = clip_and_renorm(P, eps=eps)  # floor at eps and renormalize rows before taking logs
+    L = np.log(P)
+    clr = L - L.mean(axis=1, keepdims=True)  # same row-centering as clr_transform
+    K = P.shape[1]  # number of parts (columns)
+    V = np.zeros((K, K - 1))
+    # Pivot coordinates basis (orthonormal in Aitchison geometry)
+    for j in range(1, K):
+        V[:j, j - 1] = 1 / j  # first j parts share weight 1/j ...
+        V[j, j - 1] = -1  # ... contrasted against part j
+        V[:, j - 1] *= np.sqrt(j / (j + 1))  # unit norm: (1/j + 1) * j/(j+1) == 1
+    return clr @ V  # (n, K-1)
+
+
+def process_SB_features(X, measure_name):
+    """
+    Process state-based features for a given measure.
+
+    For "CAP" and "Clustering", X is first turned into row-wise weights with softmax(-X, tau) and
+    then ILR-transformed. If measure_name matches none of the branches below, None is returned.
+
+    State-based feature vectors are compositional (non-negative and sum-to-one). We therefore analyze
+    them in the Aitchison geometry and apply the isometric log-ratio (ILR) transformation (K−1 coordinates).
+    The ILR output has K−1 dimensions.
+    """
+    tau = 1.0  # temperature; 0.5–2.0 is typical
+
+    X_transformed = None
+    if measure_name in ["CAP", "Clustering"]:
+        X_transformed = softmax(-X, tau=tau)
+        # 2) ILR transform
+        X_transformed = ilr_transform(X_transformed)
+    elif measure_name in ["ContinuousHMM", "DiscreteHMM"]:
+        X_transformed = ilr_transform(X)
+    elif measure_name in ["Windowless"]:
+        X_transformed = X.copy()
+    return X_transformed
+
+
 def get_classification_scores(
     target,
     pred,
@@ -1551,27 +1614,31 @@ def task_presence_classification(
     check_count = 2
     num_excluded_subjects = 0
     for embedding in ["PCA", "LE"]:
-        # embed dFC features
-        try:
-            X_train_embedded, X_test_embedded = embed_dFC_features(
-                train_subjects=train_subjects,
-                test_subjects=test_subjects,
-                X_train=X_train,
-                X_test=X_test,
-                y_train=y_train,
-                y_test=y_test,
-                subj_label_train=subj_label_train,
-                subj_label_test=subj_label_test,
-                embedding=embedding,
-                n_components="auto",
-                n_neighbors_LE=125,
-                LE_embedding_method="embed+procrustes",
-                measure_is_state_based=measure_is_state_based,
-            )
-        except Exception as e:
-            print(f"Error in embedding dFC features with {embedding}: {e}")
-            check_count -= 1
-            continue
+        if measure_is_state_based:
+            X_train_embedded = process_SB_features(X=X_train, measure_name=measure_name)
+            X_test_embedded = process_SB_features(X=X_test, measure_name=measure_name)
+        else:
+            # embed dFC features
+            try:
+                X_train_embedded, X_test_embedded = embed_dFC_features(
+                    train_subjects=train_subjects,
+                    test_subjects=test_subjects,
+                    X_train=X_train,
+                    X_test=X_test,
+                    y_train=y_train,
+                    y_test=y_test,
+                    subj_label_train=subj_label_train,
+                    subj_label_test=subj_label_test,
+                    embedding=embedding,
+                    n_components="auto",
+                    n_neighbors_LE=125,
+                    LE_embedding_method="embed+procrustes",
+                    measure_is_state_based=measure_is_state_based,
+                )
+            except Exception as e:
+                print(f"Error in embedding dFC features with {embedding}: {e}")
+                check_count -= 1
+                continue
 
         # check if both classes are present in train and test sets
         if len(np.unique(y_train)) < 2 or len(np.unique(y_test)) < 2:

From 3e0a0854eaa9468482615ae3c5e9af08dbf7b2d7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 1 Sep 2025 12:33:29 -0400
Subject: [PATCH 253/401] apply ILR transform to Windowless features

---
 pydfc/ml_utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 653b259..3beedaf 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1486,10 +1486,8 @@ def process_SB_features(X, measure_name):
         X_transformed = softmax(-X, tau=tau)
         # 2) ILR transform
         X_transformed = ilr_transform(X_transformed)
-    elif measure_name in ["ContinuousHMM", "DiscreteHMM"]:
+    elif measure_name in ["ContinuousHMM", "DiscreteHMM", "Windowless"]:
         X_transformed = ilr_transform(X)
-    elif measure_name in ["Windowless"]:
-        X_transformed = X.copy()
     return X_transformed
 
 

From 023a4361f5db7a84f50814909d985f1ae1c231b2 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 9 Sep 2025 20:14:06 -0400
Subject: [PATCH 254/401] add cohen d

---
 pydfc/task_utils.py | 47 +++++++++++++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index bfb39d7..adfde63 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -148,24 +148,47 @@ def plot_task_dFC(task_presence, dFC_lst, Fs_mri, TR_step=12):
     plt.show()
 
 
-################################# PCA Functions ####################################
+################################# Stat Functions ####################################
 
-# def BOLD
 
+def cohen_d_bold(X, y):
+    """
+    Compute Cohen's d per ROI between task and rest: (mean_task - mean_rest) / pooled_std.
 
-################################# Prediction Functions ####################################
+    Parameters
+    ----------
+    X : ndarray, shape (n_timepoints, n_ROIs)
+        BOLD signals.
+    y : ndarray, shape (n_timepoints,)
+        Task labels: 0 = rest, 1 = task.
 
-from sklearn.linear_model import LinearRegression
+    Returns
+    -------
+    d_values : ndarray, shape (n_ROIs,)
+        Cohen's d per ROI; positive where the task mean exceeds the rest mean.
+    """
+    task_idx = y == 1  # boolean masks; time points with any other label are ignored
+    rest_idx = y == 0
 
+    X_task = X[task_idx, :]
+    X_rest = X[rest_idx, :]
 
-def linear_reg(X, y):
-    """
-    X = (n_samples, n_features)
-    y = (n_samples, n_targets)
-    """
-    reg = LinearRegression().fit(X, y)
-    print(reg.score(X, y))
-    return reg.predict(X)
+    mean_task = X_task.mean(axis=0)
+    mean_rest = X_rest.mean(axis=0)
+
+    std_task = X_task.std(axis=0, ddof=1)  # sample std (n - 1 in the denominator)
+    std_rest = X_rest.std(axis=0, ddof=1)
+
+    n_task = X_task.shape[0]
+    n_rest = X_rest.shape[0]
+
+    pooled_std = np.sqrt(  # pooled std: group variances weighted by their n - 1
+        ((n_task - 1) * std_task**2 + (n_rest - 1) * std_rest**2) / (n_task + n_rest - 2)
+    )
+
+    d_values = (mean_task - mean_rest) / pooled_std  # NOTE(review): no guard for pooled_std == 0 or a group with < 2 samples (presumably nan/inf) - confirm
+
+    return d_values
 
 
 ################################# Validation Functions ####################################

From dc9716c6b411904210ae44bd3e1de91230586594 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 10 Sep 2025 12:04:04 -0400
Subject: [PATCH 255/401] correct TR_array in task_presence

---
 pydfc/task_utils.py | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index adfde63..71a30ec 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -378,7 +378,10 @@ def GMM_binarizing(
         )
     # some dFC measures (window-based) have a different TR than the task data
     if TR_array is not None:
-        event_labels_all_task_hrf_reshaped = event_labels_all_task_hrf_reshaped[TR_array]
+        TR_array_corrected = TR_array[TR_array < len(event_labels_all_task_hrf_reshaped)]
+        event_labels_all_task_hrf_reshaped = event_labels_all_task_hrf_reshaped[
+            TR_array_corrected
+        ]
     # now predict on vs. off for the downsampled time points
     probs = gmm.predict_proba(event_labels_all_task_hrf_reshaped)
     # Identify which component corresponds to "on" (higher mean)
@@ -456,22 +459,27 @@ def extract_task_presence(
         # other tasks
         event_labels_all_task_hrf = event_labels_all_task_hrf[:, 1]
 
+    # NOTE that index 0 of task_presence corresponds to 0 sec, but not TR_array
+    TR_array_corrected = None
+    if TR_array is not None:
+        TR_array_corrected = TR_array.astype(int) + 1
+
     if binary:
         if binarizing_method == "median":
             threshold = np.median(event_labels_all_task_hrf)
             task_presence = np.where(event_labels_all_task_hrf > threshold, 1, 0)
             task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
             # some dFC measures (window-based) have a different TR than the task data
-            if TR_array is not None:
-                task_presence = task_presence[TR_array]
+            if TR_array_corrected is not None:
+                task_presence = task_presence[TR_array_corrected]
             indices = np.arange(task_presence.shape[0])
         elif binarizing_method == "mean":
             threshold = np.mean(event_labels_all_task_hrf)
             task_presence = np.where(event_labels_all_task_hrf > threshold, 1, 0)
             task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
             # some dFC measures (window-based) have a different TR than the task data
-            if TR_array is not None:
-                task_presence = task_presence[TR_array]
+            if TR_array_corrected is not None:
+                task_presence = task_presence[TR_array_corrected]
             indices = np.arange(task_presence.shape[0])
         elif binarizing_method == "shift":
             task_presence_ratio = np.mean(event_labels_all_task)
@@ -482,8 +490,8 @@ def extract_task_presence(
             task_presence = np.where(event_labels_all_task_hrf > threshold, 1, 0)
             task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
             # some dFC measures (window-based) have a different TR than the task data
-            if TR_array is not None:
-                task_presence = task_presence[TR_array]
+            if TR_array_corrected is not None:
+                task_presence = task_presence[TR_array_corrected]
             indices = np.arange(task_presence.shape[0])
         elif binarizing_method == "GMM":
             task_presence, indices = GMM_binarizing(
@@ -492,7 +500,7 @@ def extract_task_presence(
                 downsample=True,
                 TR_mri=TR_mri,
                 TR_task=TR_task,
-                TR_array=TR_array,
+                TR_array=TR_array_corrected,
             )
         else:
             raise ValueError(
@@ -502,8 +510,8 @@ def extract_task_presence(
         task_presence = event_labels_all_task_hrf
         task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
         # some dFC measures (window-based) have a different TR than the task data
-        if TR_array is not None:
-            task_presence = task_presence[TR_array]
+        if TR_array_corrected is not None:
+            task_presence = task_presence[TR_array_corrected]
         indices = np.arange(task_presence.shape[0])
 
     return task_presence, indices

From 7fbdd157e9959beef0daa8eaa3e12f04844026d3 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 10 Sep 2025 19:48:45 -0400
Subject: [PATCH 256/401] change back TR_array in task_presence

---
 pydfc/task_utils.py | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 71a30ec..adfde63 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -378,10 +378,7 @@ def GMM_binarizing(
         )
     # some dFC measures (window-based) have a different TR than the task data
     if TR_array is not None:
-        TR_array_corrected = TR_array[TR_array < len(event_labels_all_task_hrf_reshaped)]
-        event_labels_all_task_hrf_reshaped = event_labels_all_task_hrf_reshaped[
-            TR_array_corrected
-        ]
+        event_labels_all_task_hrf_reshaped = event_labels_all_task_hrf_reshaped[TR_array]
     # now predict on vs. off for the downsampled time points
     probs = gmm.predict_proba(event_labels_all_task_hrf_reshaped)
     # Identify which component corresponds to "on" (higher mean)
@@ -459,27 +456,22 @@ def extract_task_presence(
         # other tasks
         event_labels_all_task_hrf = event_labels_all_task_hrf[:, 1]
 
-    # NOTE that index 0 of task_presence corresponds to 0 sec, but not TR_array
-    TR_array_corrected = None
-    if TR_array is not None:
-        TR_array_corrected = TR_array.astype(int) + 1
-
     if binary:
         if binarizing_method == "median":
             threshold = np.median(event_labels_all_task_hrf)
             task_presence = np.where(event_labels_all_task_hrf > threshold, 1, 0)
             task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
             # some dFC measures (window-based) have a different TR than the task data
-            if TR_array_corrected is not None:
-                task_presence = task_presence[TR_array_corrected]
+            if TR_array is not None:
+                task_presence = task_presence[TR_array]
             indices = np.arange(task_presence.shape[0])
         elif binarizing_method == "mean":
             threshold = np.mean(event_labels_all_task_hrf)
             task_presence = np.where(event_labels_all_task_hrf > threshold, 1, 0)
             task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
             # some dFC measures (window-based) have a different TR than the task data
-            if TR_array_corrected is not None:
-                task_presence = task_presence[TR_array_corrected]
+            if TR_array is not None:
+                task_presence = task_presence[TR_array]
             indices = np.arange(task_presence.shape[0])
         elif binarizing_method == "shift":
             task_presence_ratio = np.mean(event_labels_all_task)
@@ -490,8 +482,8 @@ def extract_task_presence(
             task_presence = np.where(event_labels_all_task_hrf > threshold, 1, 0)
             task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
             # some dFC measures (window-based) have a different TR than the task data
-            if TR_array_corrected is not None:
-                task_presence = task_presence[TR_array_corrected]
+            if TR_array is not None:
+                task_presence = task_presence[TR_array]
             indices = np.arange(task_presence.shape[0])
         elif binarizing_method == "GMM":
             task_presence, indices = GMM_binarizing(
@@ -500,7 +492,7 @@ def extract_task_presence(
                 downsample=True,
                 TR_mri=TR_mri,
                 TR_task=TR_task,
-                TR_array=TR_array_corrected,
+                TR_array=TR_array,
             )
         else:
             raise ValueError(
@@ -510,8 +502,8 @@ def extract_task_presence(
         task_presence = event_labels_all_task_hrf
         task_presence = downsample_events_hrf(task_presence, TR_mri, TR_task)
         # some dFC measures (window-based) have a different TR than the task data
-        if TR_array_corrected is not None:
-            task_presence = task_presence[TR_array_corrected]
+        if TR_array is not None:
+            task_presence = task_presence[TR_array]
         indices = np.arange(task_presence.shape[0])
 
     return task_presence, indices

From 4567a9e9220baa8e6ca31c83fc45ab175008aaf4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 23 Sep 2025 20:01:34 -0400
Subject: [PATCH 257/401] change calc_rest_duration and calc_task_duration

---
 pydfc/task_utils.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index adfde63..9635778 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -539,10 +539,7 @@ def calc_task_duration(task_presence, TR_mri):
             task_durations.append((end - start) * TR_mri)
             start = None
     task_durations = np.array(task_durations)
-    # find mean and variance of task durations with division error handling
-    if len(task_durations) == 0:
-        return 0, 0
-    return np.mean(task_durations), np.var(task_durations)
+    return task_durations
 
 
 def calc_rest_duration(task_presence, TR_mri):
@@ -564,10 +561,7 @@ def calc_rest_duration(task_presence, TR_mri):
         end = len(task_presence)
         rest_durations.append((end - start) * TR_mri)
     rest_durations = np.array(rest_durations)
-    # find mean and variance of rest durations with division error handling
-    if len(rest_durations) == 0:
-        return 0, 0
-    return np.mean(rest_durations), np.var(rest_durations)
+    return rest_durations
 
 
 def calc_transition_freq(task_presence):

From e2db81367656145f4f9ebc97acd17f5894fd0f0e Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 23 Sep 2025 21:40:26 -0400
Subject: [PATCH 258/401] minor

---
 pydfc/task_utils.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 9635778..aea56bf 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -523,7 +523,7 @@ def calc_relative_task_on(task_presence):
 def calc_task_duration(task_presence, TR_mri):
     """
     task_presence: 0, 1 array
-    return: avg_task_duration, var_task_duration
+    return: list of task_durations
     """
     task_durations = list()
     start = None
@@ -545,7 +545,7 @@ def calc_task_duration(task_presence, TR_mri):
 def calc_rest_duration(task_presence, TR_mri):
     """
     task_presence: 0, 1 array
-    return: avg_rest_duration, var_rest_duration
+    return: list of rest_durations
     """
     rest_durations = list()
     if task_presence[0] == 0:
@@ -555,11 +555,15 @@ def calc_rest_duration(task_presence, TR_mri):
             start = i
         if task_presence[i] == 1 and task_presence[i - 1] == 0:
             end = i
-            rest_durations.append((end - start) * TR_mri)
+            try:
+                rest_durations.append((end - start) * TR_mri)
+            except:
+                print(task_presence[: i + 1])
             start = None
     if task_presence[-1] == 0:
         end = len(task_presence)
-        rest_durations.append((end - start) * TR_mri)
+        if not start is None:
+            rest_durations.append((end - start) * TR_mri)
     rest_durations = np.array(rest_durations)
     return rest_durations
 

From 1617d43ef0844f0a982b66e3432fc4744783b757 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 24 Sep 2025 00:23:45 -0400
Subject: [PATCH 259/401] minor

---
 pydfc/task_utils.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index aea56bf..0f3c131 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -555,10 +555,7 @@ def calc_rest_duration(task_presence, TR_mri):
             start = i
         if task_presence[i] == 1 and task_presence[i - 1] == 0:
             end = i
-            try:
-                rest_durations.append((end - start) * TR_mri)
-            except:
-                print(task_presence[: i + 1])
+            rest_durations.append((end - start) * TR_mri)
             start = None
     if task_presence[-1] == 0:
         end = len(task_presence)

From 5f1ddf7496b44272e0655717f49e103785242307 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 15 Oct 2025 19:20:32 -0400
Subject: [PATCH 260/401] add across dataset scripts

---
 .pre-commit-config.yaml                       |   2 +-
 task_dFC/across_dataset.py                    | 185 ----
 .../LE_embedding_visualization.py             | 195 ++++
 task_dFC/multi_dataset_analysis/cohensd.py    | 462 +++++++++
 .../dfc_visualization.py                      | 142 +++
 .../helper_functions.py                       | 916 ++++++++++++++++++
 task_dFC/multi_dataset_analysis/ml_results.py | 544 +++++++++++
 .../sample_matrix_visualization.py            | 476 +++++++++
 .../task_presence_binarization.py             | 219 +++++
 .../task_timing_stats.py                      | 388 ++++++++
 .../run_scripts_slurm/multi_dataset_info.json |  37 +-
 .../run_across_dataset_analysis.sh            |  52 +
 task_dFC/validation.py                        |  58 --
 13 files changed, 3427 insertions(+), 249 deletions(-)
 delete mode 100644 task_dFC/across_dataset.py
 create mode 100644 task_dFC/multi_dataset_analysis/LE_embedding_visualization.py
 create mode 100644 task_dFC/multi_dataset_analysis/cohensd.py
 create mode 100644 task_dFC/multi_dataset_analysis/dfc_visualization.py
 create mode 100644 task_dFC/multi_dataset_analysis/helper_functions.py
 create mode 100644 task_dFC/multi_dataset_analysis/ml_results.py
 create mode 100644 task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
 create mode 100644 task_dFC/multi_dataset_analysis/task_presence_binarization.py
 create mode 100644 task_dFC/multi_dataset_analysis/task_timing_stats.py
 create mode 100644 task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
 delete mode 100644 task_dFC/validation.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9a7fe62..243bf82 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -37,7 +37,7 @@ repos:
     rev: v2.2.6
     hooks:
     -   id: codespell
-        args: [--toml, pyproject.toml]
+        args: [--toml, pyproject.toml, -L, whis]
         additional_dependencies: [tomli]
 
 -   repo: https://github.com/jumanjihouse/pre-commit-hook-yamlfmt
diff --git a/task_dFC/across_dataset.py b/task_dFC/across_dataset.py
deleted file mode 100644
index c7f6f9f..0000000
--- a/task_dFC/across_dataset.py
+++ /dev/null
@@ -1,185 +0,0 @@
-import argparse
-import json
-import os
-import traceback
-
-import numpy as np
-
-from pydfc.dfc_utils import dFC_mat2vec
-
-#######################################################################################
-
-
-def get_dataset_info(main_root, dataset):
-    # get the dataset_info.json
-    dataset_info_path = os.path.join(main_root, dataset, "codes", "dataset_info.json")
-    with open(dataset_info_path, "r") as f:
-        dataset_info = json.load(f)
-
-    TASKS = dataset_info["TASKS"]
-    if "RUNS" in dataset_info:
-        RUNS = dataset_info["RUNS"]
-    else:
-        RUNS = None
-    if RUNS is None:
-        RUNS = {task: [None] for task in TASKS}
-
-    if "SESSIONS" in dataset_info:
-        SESSIONS = dataset_info["SESSIONS"]
-    else:
-        SESSIONS = None
-    if SESSIONS is None:
-        SESSIONS = [None]
-
-    if "{dataset}" in dataset_info["main_root"]:
-        dataset_main_root = dataset_info["main_root"].replace("{dataset}", dataset)
-    else:
-        dataset_main_root = dataset_info["main_root"]
-
-    if "{main_root}" in dataset_info["ML_root"]:
-        ML_root = dataset_info["ML_root"].replace("{main_root}", dataset_main_root)
-    else:
-        ML_root = dataset_info["ML_root"]
-
-    return TASKS, RUNS, SESSIONS, ML_root
-
-
-def plot_affinity_matrix(centroids_mat, save_path=None):
-    import matplotlib.pyplot as plt
-    import seaborn as sns
-    from sklearn.neighbors import kneighbors_graph
-
-    fig_dpi = 120
-    fig_bbox_inches = "tight"
-    fig_pad = 0.1
-    save_fig_format = "png"  # pdf, png,
-
-    X = np.array(centroids_mat)  # shape: (n_centroids, n_regions*(n_regions-1)/2)
-
-    affinity_matrix = kneighbors_graph(
-        X,
-        n_neighbors=125,
-        mode="connectivity",
-        include_self=False,
-        metric="correlation",
-    )
-
-    # plot a heatmap of the affinity matrix
-    plt.figure(figsize=(10, 10))
-    sns.heatmap(affinity_matrix.toarray())
-    if save_path is not None:
-        plt.savefig(
-            save_path,
-            format=save_fig_format,
-            bbox_inches=fig_bbox_inches,
-            dpi=fig_dpi,
-            pad_inches=fig_pad,
-        )
-    plt.close()
-
-
-def run_across_dataset_analysis(main_root, DATASETS):
-    """_summary_
-
-    Parameters
-    ----------
-    main_root : str
-        the main root of the datasets
-    DATASETS : list
-        the list of datasets
-    """
-    RESULTS = {
-        "centroids_mat": [],
-        "task": [],
-        "run": [],
-        "session": [],
-        "measure_name": [],
-        "dataset": [],
-    }
-    for dataset in DATASETS:
-
-        TASKS, RUNS, SESSIONS, ML_root = get_dataset_info(main_root, dataset)
-
-        # Load data
-        # look for all centroids files
-        # dataset_root/ML_root/centroids/session/centroids_{session}_{task}_{run}_{measure_name}.npy
-        for session in SESSIONS:
-            if session is None:
-                input_path = os.path.join(ML_root, "centroids")
-            else:
-                input_path = os.path.join(ML_root, "centroids", session)
-            ALL_CENTROIDS_FILES = os.listdir(input_path)
-            ALL_CENTROIDS_FILES = [f for f in ALL_CENTROIDS_FILES if "centroids_" in f]
-            for task in TASKS:
-                for run in RUNS[task]:
-                    centroids_files = [f for f in ALL_CENTROIDS_FILES if f"_{task}_" in f]
-                    if run is not None:
-                        centroids_files = [f for f in centroids_files if f"_{run}_" in f]
-                    if session is not None:
-                        centroids_files = [
-                            f for f in centroids_files if f"_{session}_" in f
-                        ]
-                    for centroids_file in centroids_files:
-                        measure_name = centroids_file.split("_")[-1].replace(".npy", "")
-                        centroids = np.load(os.path.join(input_path, centroids_file))
-                        centroids_mat = centroids[
-                            "centroids_mat"
-                        ]  # shape: (n_clusters, n_regions, n_regions)
-                        centroids_mat = dFC_mat2vec(
-                            centroids_mat
-                        )  # shape: (n_clusters, n_regions*(n_regions-1)/2)
-                        for i in range(centroids_mat.shape[0]):
-                            RESULTS["centroids_mat"].append(centroids_mat[i])
-                            RESULTS["task"].append(task)
-                            RESULTS["run"].append(run)
-                            RESULTS["session"].append(session)
-                            RESULTS["measure_name"].append(measure_name)
-                            RESULTS["dataset"].append(dataset)
-
-    # give statistics
-    print(f"Number of centroids: {len(RESULTS['centroids_mat'])}")
-    print(f"Number of tasks: {len(set(RESULTS['task']))}")
-    print(f"Number of measure_names: {len(set(RESULTS['measure_name']))}")
-    print(f"Number of datasets: {len(set(RESULTS['dataset']))}")
-
-    # plot the affinity matrix
-    plot_affinity_matrix(RESULTS["centroids_mat"], save_path="affinity_matrix.png")
-
-
-#######################################################################################
-
-if __name__ == "__main__":
-    # argparse
-    HELPTEXT = """
-    Script to run across-dataset analysis on dFC results.
-    """
-
-    parser = argparse.ArgumentParser(description=HELPTEXT)
-
-    parser.add_argument(
-        "--multi_dataset_info", type=str, help="path to multi-dataset info file"
-    )
-
-    args = parser.parse_args()
-
-    multi_dataset_info = args.multi_dataset_info
-
-    # Read dataset info
-    with open(multi_dataset_info, "r") as f:
-        multi_dataset_info = json.load(f)
-
-    print("Multi-Dataset Analysis started ...")
-
-    main_root = multi_dataset_info["main_root"]
-    DATASETS = multi_dataset_info["DATASETS"]
-
-    try:
-        run_across_dataset_analysis()
-    except Exception as e:
-        print(f"Error in run_across_dataset_analysis: {e}")
-        traceback.print_exc()
-    print("run_across_dataset_analysis finished.")
-
-    print("Multi-Dataset Analysis finished.")
-
-#######################################################################################
diff --git a/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py b/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py
new file mode 100644
index 0000000..4f278b2
--- /dev/null
+++ b/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py
@@ -0,0 +1,195 @@
+import argparse
+import json
+import os
+
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.metrics import silhouette_score
+
+from pydfc.ml_utils import (
+    dFC_feature_extraction,
+    embed_dFC_features,
+    find_available_subjects,
+    process_SB_features,
+)
+
+fig_dpi = 120
+fig_bbox_inches = "tight"
+fig_pad = 0.1
+show_title = True
+save_fig_format = "png"  # pdf, png,
+
+normalize_dFC = False
+
+#######################################################################################
+
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to analyze and visualize LE-transformed features across multiple datasets.
+    """
+
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    parser.add_argument(
+        "--multi_dataset_info", type=str, help="path to multi-dataset info file"
+    )
+    parser.add_argument(
+        "--simul_or_real",
+        type=str,
+        choices=["real", "simulated"],
+        required=True,
+        help="Specify 'simulated' or 'real' data",
+    )
+
+    args = parser.parse_args()
+
+    multi_dataset_info = args.multi_dataset_info
+    simul_or_real = args.simul_or_real
+
+    # Read dataset info
+    with open(multi_dataset_info, "r") as f:
+        multi_dataset_info = json.load(f)
+
+    # argparse guarantees simul_or_real is "real" or "simulated", so the key is valid
+    data_info = multi_dataset_info[f"{simul_or_real}_data"]
+    main_root = data_info["main_root"]
+    DATASETS = data_info["DATASETS"]
+    TASKS_to_include = data_info["TASKS_to_include"]
+
+    output_root = (
+        f"{multi_dataset_info['output_root']}/task_presence_embed/{simul_or_real}"
+    )
+
+    # exist_ok avoids the check-then-create race if several jobs start at once
+    os.makedirs(output_root, exist_ok=True)
+
+    for dataset in DATASETS:
+        dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
+        roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
+        dFC_root = f"{main_root}/{dataset}/derivatives/dFC_assessed"
+
+        # Read dataset info
+        with open(dataset_info_file, "r") as f:
+            dataset_info = json.load(f)
+
+        # a missing or null "SESSIONS"/"RUNS" entry is replaced by a single None entry
+        SESSIONS = dataset_info.get("SESSIONS")
+        if SESSIONS is None:
+            SESSIONS = [None]
+
+        TASKS = dataset_info["TASKS"]
+
+        RUNS = dataset_info.get("RUNS")
+        if RUNS is None:
+            RUNS = {task: [None] for task in TASKS}
+
+        for session in SESSIONS:
+            for task in TASKS:
+                # only the first run of each task is processed
+                for run in RUNS[task][:1]:
+                    # NOTE(review): hard-coded to dFC ids 0-6 - confirm against assessed measures
+                    for dFC_id in range(7):
+                        try:
+                            SUBJECTS = find_available_subjects(
+                                dFC_root=dFC_root,
+                                task=task,
+                                dFC_id=dFC_id,
+                                session=session,
+                                run=run,
+                            )
+                            if len(SUBJECTS) == 0:
+                                print(
+                                    f"No subjects found for task {task}, dFC_id {dFC_id}, session {session}, run {run}."
+                                )
+                                continue
+                            # keep only the first available subject
+                            SUBJECTS = SUBJECTS[0:1]
+                            print(f"Number of subjects: {len(SUBJECTS)}")
+
+                            X, _, y, _, subj_label, _, measure_name = (
+                                dFC_feature_extraction(
+                                    task=task,
+                                    train_subjects=SUBJECTS,
+                                    test_subjects=[],
+                                    dFC_id=dFC_id,
+                                    roi_root=roi_root,
+                                    dFC_root=dFC_root,
+                                    run=run,
+                                    session=session,
+                                    dynamic_pred="no",
+                                    normalize_dFC=normalize_dFC,
+                                    FCS_proba_for_SB=True,
+                                )
+                            )
+
+                            assert (
+                                X.shape[0] == y.shape[0]
+                            ), "Number of samples do not match."
+                            assert (
+                                X.shape[0] == subj_label.shape[0]
+                            ), "Number of samples do not match."
+
+                            if measure_name in [
+                                "CAP",
+                                "Clustering",
+                                "ContinuousHMM",
+                                "DiscreteHMM",
+                                "Windowless",
+                            ]:
+                                X = process_SB_features(X=X, measure_name=measure_name)
+
+                            print(f"Task: {task}")
+                            print(measure_name)
+                            print(X.shape, y.shape)
+                            print(silhouette_score(X, y))
+
+                            # embed the features
+                            # n_components = "auto"
+                            n_components = 3
+                            X_embedded, _ = embed_dFC_features(
+                                train_subjects=SUBJECTS,
+                                test_subjects=[],
+                                X_train=X,
+                                X_test=None,
+                                y_train=y,
+                                y_test=None,
+                                subj_label_train=subj_label,
+                                subj_label_test=None,
+                                embedding="PCA",
+                                n_components=n_components,
+                                n_neighbors_LE=125,
+                                LE_embedding_method="embed+procrustes",
+                            )
+                            # X_embedded = TSNE(n_components=n_components, learning_rate='auto', init='random', perplexity=125, metric="correlation").fit_transform(X)
+                            print(silhouette_score(X_embedded, y))
+                            print(X_embedded.shape)
+
+                            # plot
+                            fig = plt.figure(figsize=(7, 7))
+                            ax = fig.add_subplot(111, projection="3d")
+                            for label in np.unique(y):
+                                ax.scatter(
+                                    X_embedded[y == label, 0],
+                                    X_embedded[y == label, 1],
+                                    X_embedded[y == label, 2],
+                                    label=["rest", "task"][int(label)],
+                                    s=20,
+                                )
+                            plt.legend()
+
+                            plt.savefig(
+                                f"{output_root}/task_presence_embed_{task}_{measure_name}.png",
+                                dpi=fig_dpi,
+                                bbox_inches=fig_bbox_inches,
+                                pad_inches=fig_pad,
+                                format=save_fig_format,
+                            )
+
+                            plt.close()
+                        except Exception as e:
+                            print(
+                                f"Error processing task {task}, dFC_id {dFC_id}, session {session}, run {run}: {e}"
+                            )
+                            # close any figure left open by the failure so they do not pile up
+                            plt.close("all")
diff --git a/task_dFC/multi_dataset_analysis/cohensd.py b/task_dFC/multi_dataset_analysis/cohensd.py
new file mode 100644
index 0000000..28f6d49
--- /dev/null
+++ b/task_dFC/multi_dataset_analysis/cohensd.py
@@ -0,0 +1,462 @@
+import argparse
+import json
+
+import matplotlib.pyplot as plt
+import nibabel as nib
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from nilearn import datasets, plotting
+
+from pydfc import data_loader
+from pydfc.ml_utils import find_available_subjects, load_task_data
+from pydfc.task_utils import cohen_d_bold, extract_task_presence
+
+#######################################################################################
+
+if __name__ == "__main__":
+    # Local import: `os` is only needed to create the output directory and is
+    # not among this script's module-level imports.
+    import os
+
+    # argparse
+    HELPTEXT = """
+    Script to compute and visualize Cohen's d effect sizes for task vs. rest BOLD signals across multiple datasets.
+    """
+
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    # Required: without a path, `open(None)` below would fail with an opaque
+    # TypeError instead of a usage message.
+    parser.add_argument(
+        "--multi_dataset_info",
+        type=str,
+        required=True,
+        help="path to multi-dataset info file",
+    )
+
+    args = parser.parse_args()
+
+    multi_dataset_info = args.multi_dataset_info
+
+    # Read dataset info (the name is rebound from the file path to its parsed
+    # JSON content; later sections index it as a dict).
+    with open(multi_dataset_info, "r") as f:
+        multi_dataset_info = json.load(f)
+
+    main_root = multi_dataset_info["real_data"]["main_root"]
+    DATASETS = multi_dataset_info["real_data"]["DATASETS"]
+    TASKS_to_include = multi_dataset_info["real_data"]["TASKS_to_include"]
+    output_root = f"{multi_dataset_info['output_root']}/CohensD"
+
+    # Every figure of this script is written below output_root. Create it up
+    # front so a missing directory cannot make savefig fail after the
+    # per-subject processing has already been done.
+    os.makedirs(output_root, exist_ok=True)
+
+    # Long-format accumulator: one entry per (dataset, task, ROI).
+    CohensD_across_task = {
+        "task": [],
+        "d_values": [],
+        "dataset": [],
+        "ROI": [],
+    }
+    # The MNI template and the Schaefer atlas are the same for every dataset
+    # and task, so they are loaded once here instead of once per task.
+    template_img = datasets.load_mni152_template()
+    template_affine = template_img.affine
+    inv_template_affine = np.linalg.inv(template_affine)
+    template_shape = np.array(template_img.shape)
+
+    # Integer voxel offsets of a ball around the origin; adding them to an ROI
+    # centre gives the voxels of the sphere painted for that ROI.
+    radius = 5  # in voxels
+    axis_offsets = np.arange(-radius, radius + 1)
+    off_i, off_j, off_k = np.meshgrid(
+        axis_offsets, axis_offsets, axis_offsets, indexing="ij"
+    )
+    in_ball = off_i**2 + off_j**2 + off_k**2 <= radius**2
+    sphere_offsets = np.stack(
+        [off_i[in_ball], off_j[in_ball], off_k[in_ball]], axis=1
+    )  # (n_voxels_in_ball, 3)
+
+    # Load Schaefer atlas (100 parcels)
+    schaefer = datasets.fetch_atlas_schaefer_2018(n_rois=100)
+    # schaefer["maps"] is the path to the NIfTI file; load it
+    atlas_img = nib.load(schaefer["maps"])
+    atlas_data = atlas_img.get_fdata()
+    # Labels may come back as bytes or as str depending on the nilearn
+    # version; normalise both to str.
+    atlas_labels = [
+        label.decode() if isinstance(label, bytes) else str(label)
+        for label in schaefer["labels"]
+    ]
+
+    for dataset in DATASETS:
+        print(f"Processing dataset: {dataset}")
+        dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
+        roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
+        dFC_root = f"{main_root}/{dataset}/derivatives/dFC_assessed"
+
+        # Read dataset info
+        with open(dataset_info_file, "r") as f:
+            dataset_info = json.load(f)
+
+        # A missing or null SESSIONS entry means "no session level": iterate
+        # once with session=None.
+        SESSIONS = dataset_info.get("SESSIONS")
+        if SESSIONS is None:
+            SESSIONS = [None]
+
+        TASKS = dataset_info["TASKS"]
+
+        # Same convention for RUNS: one run-less pass per task.
+        RUNS = dataset_info.get("RUNS")
+        if RUNS is None:
+            RUNS = {task: [None] for task in TASKS}
+
+        for task in TASKS:
+            if task not in TASKS_to_include:
+                print(f"Skipping task {task} as it's not in the inclusion list.")
+                continue
+            # One Cohen's d vector (one value per ROI) per subject/run/session.
+            d_values_all = []
+            for session in SESSIONS:
+                print(f"Processing task: {task}")
+                SUBJECTS = find_available_subjects(
+                    dFC_root=dFC_root,
+                    task=task,
+                    dFC_id=None,
+                    session=session,
+                )
+                for subj in SUBJECTS:
+                    for run in RUNS[task]:
+                        try:
+                            task_data = load_task_data(
+                                roi_root=roi_root,
+                                subj=subj,
+                                task=task,
+                                run=run,
+                                session=session,
+                            )
+                        except Exception as e:
+                            # Best effort: skip this combination, but say why
+                            # and let KeyboardInterrupt/SystemExit propagate.
+                            print(
+                                f"Error loading task data for subject {subj}, task {task}, session {session}, run {run}: {e}"
+                            )
+                            continue
+
+                        # File-name template; the placeholders are left
+                        # unformatted and handed to load_TS as-is. Session
+                        # and run fields appear only when they are set.
+                        name_fields = ["{subj_id}"]
+                        if session is not None:
+                            name_fields.append("{session}")
+                        name_fields.append("{task}")
+                        if run is not None:
+                            name_fields.append("{run}")
+                        BOLD_file_name = "_".join(name_fields) + "_time-series.npy"
+
+                        try:
+                            BOLD = data_loader.load_TS(
+                                data_root=roi_root,
+                                file_name=BOLD_file_name,
+                                subj_id2load=subj,
+                                task=task,
+                                session=session,
+                                run=run,
+                            )
+                        except Exception as e:
+                            print(f"Error loading BOLD data: {e}")
+                            continue
+                        BOLD_data = BOLD.data  # np.ndarray (n_ROIs, n_TRs)
+
+                        Fs_task = task_data["Fs_task"]
+                        TR_task = 1 / Fs_task
+
+                        TR_array = np.arange(0, BOLD_data.shape[1])
+                        task_presence, indices = extract_task_presence(
+                            event_labels=task_data["event_labels"],
+                            TR_task=TR_task,
+                            TR_mri=task_data["TR_mri"],
+                            binary=True,
+                            binarizing_method="GMM",
+                            no_hrf=False,
+                            TR_array=TR_array,
+                        )
+
+                        # if n_TRs do not match, align them
+                        if BOLD_data.shape[1] != task_presence.shape[0]:
+                            print(
+                                f"Before alignment, shape of task_presence: {task_presence.shape}, shape of BOLD_data: {BOLD_data.shape}"
+                            )
+                            min_TRs = min(BOLD_data.shape[1], task_presence.shape[0])
+                            task_presence = task_presence[:min_TRs]
+                            BOLD_data = BOLD_data[:, :min_TRs]
+                            print(
+                                f"After alignment, shape of task_presence: {task_presence.shape}, shape of BOLD_data: {BOLD_data.shape}"
+                            )
+                            # also adjust indices
+                            indices = [i for i in indices if i < min_TRs]
+                        # Keep only the TRs selected by extract_task_presence.
+                        task_presence = task_presence[indices]  # (n_TRs,)
+                        BOLD_data = BOLD_data[:, indices]  # (n_ROIs, n_TRs)
+
+                        assert BOLD_data.shape[1] == task_presence.shape[0]
+
+                        cohen_d = cohen_d_bold(X=BOLD_data.T, y=task_presence)
+                        d_values_all.append(cohen_d)
+
+            if not d_values_all:
+                print(f"No data found for task {task} in dataset {dataset}. Skipping.")
+                continue
+            d_values_all = np.array(d_values_all)  # (n_subjectsxrunsxsessions, n_ROIs)
+            avg_d_values = np.nanmean(d_values_all, axis=0)  # (n_ROIs,)
+            # ROI labels and coordinates are taken from the most recently
+            # loaded BOLD object of this task.
+            CohensD_across_task["d_values"].extend(avg_d_values)
+            CohensD_across_task["task"].extend([task] * len(avg_d_values))
+            CohensD_across_task["dataset"].extend([dataset] * len(avg_d_values))
+            CohensD_across_task["ROI"].extend(BOLD.node_labels)
+
+            # Colour limit for the signed maps below: the largest |d|,
+            # ignoring NaNs. np.max(avg_d_values) would be NaN if any ROI is
+            # NaN and negative if every effect is negative.
+            d_abs_max = np.nanmax(np.abs(avg_d_values))
+
+            # plot d values on a glass brain
+            coords = BOLD.locs
+            data = np.zeros(template_img.shape)
+
+            # Paint a small sphere for each ROI coordinate
+            for c, d in zip(coords, avg_d_values):
+                # world coordinate -> nearest voxel index on the template grid
+                center_ijk = np.round(
+                    nib.affines.apply_affine(inv_template_affine, c)
+                ).astype(int)
+                voxels = center_ijk + sphere_offsets
+                # drop the part of the sphere that falls outside the volume
+                inside = np.all((voxels >= 0) & (voxels < template_shape), axis=1)
+                voxels = voxels[inside]
+                data[voxels[:, 0], voxels[:, 1], voxels[:, 2]] = d
+
+            d_img = nib.Nifti1Image(data, template_affine)
+
+            plotting.plot_glass_brain(
+                d_img,
+                display_mode="ortho",
+                colorbar=True,
+                plot_abs=False,
+                cmap="coolwarm",
+                vmax=d_abs_max,
+            )
+
+            plt.savefig(
+                f"{output_root}/cohensd_region_{task}.png",
+                dpi=120,
+                bbox_inches="tight",
+                pad_inches=0.1,
+                format="png",
+            )
+
+            plt.close()
+
+            # check that the atlas labels match BOLD.node_labels
+            assert all(
+                i == j for i, j in zip(atlas_labels, BOLD.node_labels)
+            ), "Labels do not match!"
+
+            # Fill every atlas parcel with the Cohen's d of its ROI.
+            cohen_img_data = np.zeros(atlas_data.shape)
+
+            for i, d in enumerate(avg_d_values):
+                cohen_img_data[atlas_data == (i + 1)] = d  # labels start from 1
+
+            cohen_img = nib.Nifti1Image(cohen_img_data, affine=atlas_img.affine)
+
+            plotting.plot_glass_brain(
+                cohen_img,
+                display_mode="ortho",
+                colorbar=True,
+                cmap="coolwarm",
+                plot_abs=False,
+                vmax=d_abs_max,
+            )
+
+            plt.savefig(
+                f"{output_root}/cohensd_voxel_{task}.png",
+                dpi=120,
+                bbox_inches="tight",
+                pad_inches=0.1,
+                format="png",
+            )
+
+            plt.close()
+
+    # --- Across-task correlation with ML performance (ABSOLUTE Cohen's d) ---
+    # Relates, per task, the largest |Cohen's d| over ROIs to the SVM balanced
+    # accuracy (best run per dFC method, then averaged over methods).
+
+    # Load ALL_ML_SCORES
+    # NOTE(security): allow_pickle=True unpickles arbitrary Python objects.
+    # Only point this at a results file produced by a trusted pipeline.
+    ALL_ML_SCORES = np.load(
+        f"{multi_dataset_info['output_root']}/ML_results/ALL_ML_SCORES_real.npy",
+        allow_pickle=True,
+    ).item()
+
+    embedding = "LE"
+    metric = "SVM balanced accuracy"
+    GROUP = "test"
+
+    # Long-format table of the per-ROI Cohen's d values collected above
+    DF = pd.DataFrame.from_dict(CohensD_across_task)
+
+    # Use absolute Cohen's d
+    DF["abs_d"] = DF["d_values"].abs()
+
+    # Max |d| over ROIs for each task, largest first
+    max_abs_per_task = (
+        DF.groupby("task")["abs_d"]
+        .max()
+        .sort_values(ascending=False)
+        .reset_index(name="abs_max")
+    )
+
+    df = pd.DataFrame.from_dict(ALL_ML_SCORES)
+    df = df[df["task"].isin(TASKS_to_include)]
+    # .copy(): a column is assigned on this frame right below; working on an
+    # explicit copy avoids pandas' chained-assignment warning.
+    df = df[(df["embedding"] == embedding) & (df["group"] == GROUP)].copy()
+
+    # alphabetical method order
+    method_order = sorted(df["dFC method"].unique(), key=lambda s: s.lower())
+    df["dFC method"] = pd.Categorical(
+        df["dFC method"], categories=method_order, ordered=True
+    )
+
+    # BEST: one row per (task, method) with the winning run kept
+    df_best = (
+        df.sort_values(["task", "dFC method", metric], ascending=[True, True, False])
+        .drop_duplicates(subset=["task", "dFC method"], keep="first")
+        .rename(columns={metric: "score"})
+    )
+
+    # keep only the task and score columns
+    df_best = df_best[["task", "score"]]
+
+    # average over dFC methods and make a new dataframe
+    df_best = df_best.groupby("task").agg({"score": "mean"}).reset_index()
+    # pair max_abs_per_task["abs_max"] with df_best["score"] task by task
+    merged = pd.merge(max_abs_per_task, df_best, on="task")
+
+    # task="task-ppalocalizer" is an outlier, show it as a different color and exclude it from the correlation calculation
+    outlier = merged[merged["task"] == "task-ppalocalizer"]
+    merged = merged[merged["task"] != "task-ppalocalizer"]
+    plt.style.use("seaborn-v0_8-paper")
+    sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.2})
+    sns.set_style("darkgrid")
+    plt.figure(figsize=(10, 8))
+    sns.scatterplot(
+        x="abs_max", y="score", data=merged, s=60, edgecolor="k", label="Task Paradigms"
+    )
+    sns.scatterplot(
+        x="abs_max",
+        y="score",
+        data=outlier,
+        color="orange",
+        s=80,
+        edgecolor="k",
+        label="Outlier: task-ppalocalizer",
+    )
+
+    # fit and plot regression line (outlier excluded)
+    sns.regplot(
+        x="abs_max",
+        y="score",
+        data=merged,
+        scatter=False,
+        color="red",
+        line_kws={"label": "Best fit"},
+    )
+
+    plt.xlabel("Max |Cohen's d| per Task", fontweight="bold", fontsize=14)
+    plt.ylabel("SVM Balanced Accuracy", fontweight="bold", fontsize=14)
+    plt.legend(fontsize=12)
+    # Series.corr default (Pearson), computed without the outlier
+    correlation = merged["abs_max"].corr(merged["score"])
+    plt.text(
+        0.05,
+        0.95,
+        f"correlation  r = {correlation:.2f}",
+        transform=plt.gca().transAxes,
+        fontsize=17,
+        fontweight="bold",
+        verticalalignment="top",
+    )
+
+    plt.xticks(fontweight="bold", fontsize=12)
+    plt.yticks(fontweight="bold", fontsize=12)
+    plt.grid(True)
+    plt.tight_layout()
+    plt.savefig(
+        f"{output_root}/CohensdCorr.png",
+        dpi=150,
+        bbox_inches="tight",
+        pad_inches=0.2,
+        format="png",
+    )
+    plt.close()
+
+    # --- Across-task visualizations (ABSOLUTE Cohen's d) ---
+    sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.2})
+    sns.set_style("darkgrid")
+
+    def _finish_task_axis(axis, y_label, out_path):
+        """Label a per-task axis, rotate its tick labels, then save and close the figure."""
+        axis.set_xlabel("Task")
+        axis.set_ylabel(y_label)
+        axis.set_ylim(bottom=0)
+        axis.set_xticklabels(axis.get_xticklabels(), rotation=45, ha="right")
+        plt.tight_layout()
+        plt.savefig(
+            out_path,
+            dpi=150,
+            bbox_inches="tight",
+            pad_inches=0.2,
+            format="png",
+        )
+        plt.close()
+
+    # Per-ROI table with an extra column holding the absolute Cohen's d
+    d_frame = pd.DataFrame.from_dict(CohensD_across_task).assign(
+        abs_d=lambda frame: frame["d_values"].abs()
+    )
+
+    # Largest |d| over ROIs for every task, strongest task first; this order
+    # is shared by both figures.
+    per_task_max = d_frame.groupby("task")["abs_d"].max()
+    per_task_max = per_task_max.sort_values(ascending=False)
+    max_abs_table = per_task_max.reset_index(name="abs_max")
+    tasks_by_effect = max_abs_table["task"].tolist()
+
+    # Dynamic width so labels don't collide (0.6 inch per task, min 14 inches)
+    width_in = max(14, 0.6 * len(tasks_by_effect))
+
+    # -------- Figure 1: Boxplot of |Cohen's d| per task with individual samples --------
+    plt.figure(figsize=(width_in, 7))
+
+    # Fliers are hidden because every sample is drawn by the strip plot below.
+    box_ax = sns.boxplot(
+        data=d_frame,
+        x="task",
+        y="abs_d",
+        order=tasks_by_effect,
+        showfliers=False,
+        width=0.6,
+    )
+
+    # One small jittered point per ROI sample, on the same axis
+    sns.stripplot(
+        data=d_frame,
+        x="task",
+        y="abs_d",
+        order=tasks_by_effect,
+        dodge=False,
+        jitter=0.25,
+        size=2,
+        alpha=0.45,
+        ax=box_ax,
+    )
+
+    _finish_task_axis(
+        box_ax,
+        "|Cohen's d|",
+        f"{output_root}/CohensD_abs_boxplot_with_samples_per_task.png",
+    )
+
+    # -------- Figure 2: Max |Cohen's d| across ROIs per task --------
+    plt.figure(figsize=(width_in, 6))
+
+    bar_ax = sns.barplot(
+        data=max_abs_table, x="task", y="abs_max", order=tasks_by_effect
+    )
+
+    # Write each bar's value (2 decimals) just above its top edge
+    for bar in bar_ax.patches:
+        bar_height = bar.get_height()
+        bar_center_x = bar.get_x() + bar.get_width() / 2.0
+        bar_ax.annotate(
+            f"{bar_height:.2f}",
+            (bar_center_x, bar_height),
+            ha="center",
+            va="bottom",
+            xytext=(0, 2),
+            textcoords="offset points",
+            fontsize=8,
+        )
+
+    _finish_task_axis(
+        bar_ax,
+        "Max |Cohen's d|",
+        f"{output_root}/CohensD_abs_max_per_task.png",
+    )
diff --git a/task_dFC/multi_dataset_analysis/dfc_visualization.py b/task_dFC/multi_dataset_analysis/dfc_visualization.py
new file mode 100644
index 0000000..b4395ef
--- /dev/null
+++ b/task_dFC/multi_dataset_analysis/dfc_visualization.py
@@ -0,0 +1,142 @@
+import argparse
+import json
+import os
+import sys
+
+from pydfc.dfc_utils import TR_intersection, rank_norm
+from pydfc.ml_utils import find_available_subjects, load_dFC
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from helper_functions import (  # pyright: ignore[reportMissingImports]
+    figure_dfc_matrices_window_png,
+)
+
+# When True, every dFC matrix stack is passed through rank_norm before plotting.
+normalize_dFC = True
+
+#######################################################################################
+
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to visualize dFC matrices (one example subject per task) for each dataset of a multi-dataset analysis.
+    """
+
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    # Both options are required, and --simul_or_real is restricted to its two
+    # supported values, so a bad invocation is rejected by argparse instead of
+    # creating an output directory and then failing on an undefined variable.
+    parser.add_argument(
+        "--multi_dataset_info",
+        type=str,
+        required=True,
+        help="path to multi-dataset info file",
+    )
+    parser.add_argument(
+        "--simul_or_real",
+        type=str,
+        required=True,
+        choices=["simulated", "real"],
+        help="Specify 'simulated' or 'real' data",
+    )
+
+    args = parser.parse_args()
+
+    multi_dataset_info = args.multi_dataset_info
+    simul_or_real = args.simul_or_real
+
+    # Read dataset info (the name is rebound from the path to the parsed JSON)
+    with open(multi_dataset_info, "r") as f:
+        multi_dataset_info = json.load(f)
+
+    print("Multi-Dataset Analysis started ...")
+
+    # The info file keeps one section per data kind: "real_data" / "simulated_data".
+    data_section = multi_dataset_info[f"{simul_or_real}_data"]
+    main_root = data_section["main_root"]
+    DATASETS = data_section["DATASETS"]
+    # NOTE(review): TASKS_to_include is read but never applied below, so every
+    # task of every dataset is visualized. Confirm whether a filter is intended.
+    TASKS_to_include = data_section["TASKS_to_include"]
+    output_root = f"{multi_dataset_info['output_root']}/dFC/{simul_or_real}"
+
+    os.makedirs(output_root, exist_ok=True)
+
+    for dataset in DATASETS:
+        dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
+        dFC_root = f"{main_root}/{dataset}/derivatives/dFC_assessed"
+
+        # Read dataset info
+        with open(dataset_info_file, "r") as f:
+            dataset_info = json.load(f)
+
+        # A missing or null SESSIONS entry means "no session level".
+        SESSIONS = dataset_info.get("SESSIONS")
+        if SESSIONS is None:
+            SESSIONS = [None]
+
+        TASKS = dataset_info["TASKS"]
+
+        # Same convention for RUNS: one run-less entry per task.
+        RUNS = dataset_info.get("RUNS")
+        if RUNS is None:
+            RUNS = {task: [None] for task in TASKS}
+
+        # DATA[task][measure_name] -> dFC object of the example subject
+        DATA = {}
+        for dFC_id in range(0, 7):
+            for session in SESSIONS[:1]:  # Only process the first session
+                for task in TASKS:
+                    for run in RUNS[task][:1]:  # Only process the first run
+                        print(
+                            f"Processing dataset: {dataset}, task: {task}, run: {run}, session: {session}, dFC_id: {dFC_id}"
+                        )
+
+                        SUBJECTS = find_available_subjects(
+                            dFC_root=dFC_root,
+                            task=task,
+                            dFC_id=dFC_id,
+                            session=session,
+                            run=run,
+                        )
+                        if len(SUBJECTS) == 0:
+                            print(
+                                f"No subjects found for dataset: {dataset}, task: {task}, run: {run}, session: {session}, dFC_id: {dFC_id}"
+                            )
+                            continue
+
+                        subj = SUBJECTS[0]  # Only process the first subject
+
+                        dFC = load_dFC(
+                            dFC_root=dFC_root,
+                            subj=subj,
+                            task=task,
+                            dFC_id=dFC_id,
+                            run=run,
+                            session=session,
+                        )
+
+                        DATA.setdefault(task, {})[dFC.measure.measure_name] = dFC
+
+        # visualize the dFC matrices for each task
+        for task, dFC_by_measure in DATA.items():
+            # first find common TRs across measures
+            common_TRs = TR_intersection(list(dFC_by_measure.values()))
+
+            dFC_mat_dict = {}
+            for measure_name, dFC in dFC_by_measure.items():
+                dFC_mat = dFC.get_dFC_mat(TRs=common_TRs)
+                if normalize_dFC:
+                    dFC_mat = rank_norm(dFC_mat)
+                dFC_mat_dict[measure_name] = dFC_mat
+            figure_dfc_matrices_window_png(
+                dFC_mat_dict,
+                common_TRs,
+                window_len=10,
+                cmap="plasma",
+                outfile=f"{output_root}/dFC_{dataset}_{task}_mid_10.png",
+                dpi=600,
+            )
+
+        print(f"Saved data for dataset {dataset}")
diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
new file mode 100644
index 0000000..bb3a768
--- /dev/null
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -0,0 +1,916 @@
+import colorsys
+import math
+import re
+import textwrap
+from pathlib import Path
+
+import matplotlib as mpl
+import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
+import matplotlib.transforms as mtransforms
+import numpy as np
+import pandas as pd
+import seaborn as sns
+
+###################### Publication style ######################
+
+
+def setup_pub_style():
+    """Apply the seaborn "paper"/"whitegrid" theme, then the shared rcParams overrides."""
+    # Fonts & text
+    text_rc = {
+        "font.size": 10,  # base
+        "axes.titlesize": 12,
+        "axes.labelsize": 11,
+        "xtick.labelsize": 9,
+        "ytick.labelsize": 9,
+        "legend.fontsize": 9,
+        "figure.titlesize": 13,
+        "axes.titlepad": 8,
+        "axes.labelpad": 6,
+    }
+    # Lines/markers
+    stroke_rc = {
+        "lines.linewidth": 1.5,
+        "lines.markersize": 5,
+        "axes.linewidth": 0.8,
+        "grid.linewidth": 0.6,
+    }
+    # Figure/layout and export
+    output_rc = {
+        "figure.dpi": 150,  # on-screen
+        "savefig.dpi": 500,  # export
+        "savefig.bbox": "tight",
+        "savefig.pad_inches": 0.04,
+        # Font type 42 (TrueType) for PDF/PS export
+        "pdf.fonttype": 42,
+        "ps.fonttype": 42,
+    }
+
+    # The theme is applied first so the overrides below are set after it.
+    sns.set_theme(context="paper", style="whitegrid")
+    mpl.rcParams.update({**text_rc, **stroke_rc, **output_rc})
+
+
+def savefig_pub(path_png_or_pdf: str):
+    """
+    Save the current matplotlib figure to ``path_png_or_pdf``.
+
+    The parent directory is created first (including missing ancestors).
+    Only the given path is written; no companion PDF is exported.
+    """
+    target = Path(path_png_or_pdf)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    plt.savefig(path_png_or_pdf)
+
+
+###################### ml_results ######################
+
+
+def get_cog_domain_info(simul_or_real: str):
+    """
+    Look up the domain grouping used to order and colour tasks.
+
+    Args:
+        simul_or_real: "real" for the cognitive-domain grouping, "simul" for
+            the categories of simulated task paradigms.
+
+    Return:
+        DOMAIN_ORDER: list of domains in preferred order
+        TASK2DOMAIN: dict mapping canonical task codes to domains
+        DOMAIN_BASE: dict mapping domains to base colors (hex)
+
+    Raises:
+        ValueError: if simul_or_real is neither "real" nor "simul".
+    """
+    # Each table row is (domain, base colour, canonical task codes). Rows are
+    # listed in display order, and all three return values are derived from
+    # the same table.
+    if simul_or_real == "real":
+        domain_table = [
+            (
+                "Language",
+                "#1f77b4",
+                [
+                    "audrhyme",
+                    "audsem",
+                    "audspell",
+                    "visrhyme",
+                    "vissem",
+                    "visspell",
+                    "speech",
+                ],
+            ),
+            ("Numerical cognition", "#ff7f0e", ["arithmetic"]),
+            (
+                "Cognitive control",
+                "#02833E",
+                ["stroop", "gstroop", "cuedts", "axcpt", "matching"],
+            ),
+            ("Working memory", "#d62728", ["stern", "vswm", "workingmemory"]),
+            ("Attention", "#9467bd", ["spatialdetection", "oddball"]),
+            (
+                "Decision-making & valuation",
+                "#8c564b",
+                ["bart", "risk", "itc", "delaydiscounting", "mgt"],
+            ),
+            (
+                "Emotion & social processes",
+                "#e377c2",
+                [
+                    "emomatching",
+                    "anticipation",
+                    "fearlearning",
+                    "emotionregulation",
+                    "faces",
+                ],
+            ),
+            ("Cue reactivity / craving", "#D337D5", ["cic"]),
+            ("Pain / nociception", "#bcbd22", ["paingen"]),
+            ("Sensorimotor", "#17becf", ["motor", "execution", "imagery", "ihg"]),
+            ("Perception & naturalistic memory", "#1f9e89", ["expo", "recall"]),
+            # Methodological domains
+            ("Neurofeedback", "#d0e81f", ["feedback"]),
+            (
+                "Functional localizers",
+                "#35cf33",
+                [
+                    "ppalocalizer",
+                    "floc",
+                    "fribbids",
+                    "midloc",
+                    "localiser",
+                    "localizer",
+                ],
+            ),
+            # Catch-all domain: has a colour but no task codes of its own
+            ("Other", "#646464", []),
+        ]
+    elif simul_or_real == "simul":
+        domain_table = [
+            (
+                "Simulated Periodic",
+                "#1f77b4",
+                ["lowfreqlongrest", "lowfreqshortrest", "lowfreqshorttask"],
+            ),
+            (
+                "Good Paradigm Design, Strong Performance on Real Data",
+                "#ff7f0e",
+                ["axcpt", "stern", "cuedts"],
+            ),
+            (
+                "Good Paradigm Design, Poor Performance on Real Data",
+                "#02833E",
+                ["execution", "imagery", "localizer", "ppalocalizer"],
+            ),
+            (
+                "Poor Paradigm Design, Poor Performance on Real Data",
+                "#d62728",
+                ["itc", "stroop", "risk"],
+            ),
+        ]
+    else:
+        raise ValueError(f"Invalid simul_or_real: {simul_or_real}")
+
+    DOMAIN_ORDER = [domain for domain, _, _ in domain_table]
+    TASK2DOMAIN = {
+        code: domain for domain, _, codes in domain_table for code in codes
+    }
+    DOMAIN_BASE = {domain: colour for domain, colour, _ in domain_table}
+    return DOMAIN_ORDER, TASK2DOMAIN, DOMAIN_BASE
+
+
+def canon_task(task_str: str) -> str:
+    """
+    Reduce a task name to its canonical lookup key.
+
+    Every occurrence of "task-" is removed, then every character that is not
+    an ASCII letter is dropped, and the remainder is lower-cased.
+    """
+    letters_only = re.sub(r"[^a-zA-Z]", "", task_str.replace("task-", ""))
+    return letters_only.lower()
+
+
+def task_domain_real(task: str) -> str:
+    """Return the "real" domain of ``task``, or "Other" if its canonical code is unmapped."""
+    code_to_domain = get_cog_domain_info("real")[1]
+    code = canon_task(task)
+    return code_to_domain.get(code, "Other")
+
+
+def task_domain_simul(task: str) -> str:
+    """Return the "simul" category of ``task``, or "Other" if its canonical code is unmapped."""
+    code_to_domain = get_cog_domain_info("simul")[1]
+    code = canon_task(task)
+    return code_to_domain.get(code, "Other")
+
+
+def shade_series_same_hue(base_hex: str, n: int, delta_L=0.08, delta_S=0.06):
+    """
+    Build ``n`` closely related shades of one colour, all sharing its hue.
+
+    Lightness and saturation are shifted by evenly spaced offsets in
+    [-delta_L, +delta_L] and [-delta_S, +delta_S] around the base colour
+    (smaller deltas = more similar shades). Lightness is clipped to
+    [0.05, 0.95] and saturation to [0.20, 0.95].
+
+    Returns a list of hex colour strings. For ``n <= 1`` this is
+    ``[base_hex]``, returned unchanged.
+    """
+    if n <= 1:
+        return [base_hex]
+
+    # colorsys works in HLS order: (hue, lightness, saturation)
+    hue, base_light, base_sat = colorsys.rgb_to_hls(*mcolors.to_rgb(base_hex))
+
+    # offsets symmetric around the base lightness / saturation
+    light_steps = np.linspace(-delta_L, +delta_L, n)
+    sat_steps = np.linspace(-delta_S, +delta_S, n)
+
+    def _shade(light_step, sat_step):
+        light = float(np.clip(base_light + light_step, 0.05, 0.95))
+        sat = float(np.clip(base_sat + sat_step, 0.20, 0.95))
+        return mcolors.to_hex(colorsys.hls_to_rgb(hue, light, sat))
+
+    return [_shade(dl, ds) for dl, ds in zip(light_steps, sat_steps)]
+
+
+def build_task_order_and_palette(
+    tasks_iterable, simul_or_real, similarity_L=0.08, similarity_S=0.06
+):
+    """Domain-first task order plus near-identical shades per domain.
+
+    Tasks are sorted case-insensitively inside each domain of DOMAIN_ORDER; tasks
+    whose domain is not listed there are left out of both returned objects.
+    """
+    tasks = list(tasks_iterable)
+    lookup = {"real": task_domain_real, "simul": task_domain_simul}.get(simul_or_real)
+    if lookup is not None:
+        dom_of = {t: lookup(t) for t in tasks}
+    DOMAIN_ORDER, _, DOMAIN_BASE = get_cog_domain_info(simul_or_real)
+    # pass 1: walk the domains in order, alphabetical inside each domain
+    task_order = []
+    for dom in DOMAIN_ORDER:
+        members = (t for t in tasks if dom_of[t] == dom)
+        task_order += sorted(members, key=lambda name: name.lower())
+
+    # pass 2: one run of near-identical shades per non-empty domain
+    palette = {}
+    for dom in DOMAIN_ORDER:
+        grp = [t for t in task_order if dom_of.get(t, "Other") == dom]
+        if grp:
+            shades = shade_series_same_hue(
+                DOMAIN_BASE[dom], len(grp), delta_L=similarity_L, delta_S=similarity_S
+            )
+            palette.update(zip(grp, shades))
+    return task_order, palette
+
+
+def domain_sorted_rows(index_tasks, TASKS_to_include, simul_or_real):
+    """
+    Keep the tasks of `index_tasks` found in `TASKS_to_include`, grouped by domain
+    in DOMAIN_ORDER and sorted case-insensitively inside each domain.
+    """
+    present = [t for t in index_tasks if t in TASKS_to_include]
+    lookup = {"real": task_domain_real, "simul": task_domain_simul}.get(simul_or_real)
+    if lookup is not None:
+        dom_of = {t: lookup(t) for t in present}
+    DOMAIN_ORDER, _, _ = get_cog_domain_info(simul_or_real)
+    # tasks mapped to a domain outside DOMAIN_ORDER are not returned
+    ordered = []
+    for dom in DOMAIN_ORDER:
+        members = (t for t in present if dom_of[t] == dom)
+        ordered += sorted(members, key=lambda name: name.lower())
+    return ordered
+
+
+def boldify_axes(ax, xlabel=None, ylabel=None, rotate_xticks=35):
+    """Make the x tick labels of `ax` bold; set bold axis labels only when given.
+
+    A rotation also right-aligns the tick labels; rotate_xticks=None skips both.
+    """
+    for text, setter in ((xlabel, ax.set_xlabel), (ylabel, ax.set_ylabel)):
+        if text is not None:
+            setter(text, fontweight="bold")
+    tick_style = {"fontweight": "bold"}
+    if rotate_xticks is not None:
+        tick_style.update(rotation=rotate_xticks, ha="right")
+    plt.setp(ax.get_xticklabels(), **tick_style)
+
+
+def draw_grouped_legend_panel(
+    ax_leg,
+    task_order,
+    domain_of,
+    palette,
+    domain_order,
+    ncols=2,
+    fontsize=8,
+    markersize=5,
+    colpad=0.04,
+):
+    """
+    Draw a hand-laid-out legend on `ax_leg`: one bold header per domain followed
+    by a colored marker and the name of each of that domain's tasks.
+
+    Entries fill `ncols` columns top to bottom; domains without tasks are skipped
+    and tasks missing from `palette` fall back to the gray "0.4".
+    """
+    ax_leg.set_axis_off()
+    ax_leg.set_xlim(0, 1)
+    ax_leg.set_ylim(0, 1)
+
+    # flatten to (kind, text) entries in display order
+    items = []
+    for dom in domain_order:
+        members = [t for t in task_order if domain_of.get(t, "Other") == dom]
+        if members:
+            items.append(("header", dom))
+            items += [("task", t) for t in members]
+
+    rows_per_col = max(1, math.ceil(len(items) / ncols))
+    top = 0.98
+    dy = (top - 0.06) / rows_per_col
+
+    # entry number -> (column, row): columns are filled one after another
+    for pos, (kind, val) in enumerate(items):
+        col, row_in_col = divmod(pos, rows_per_col)
+        if col >= ncols:
+            break
+        x = 0.02 + col * (1.0 / ncols)
+        y = top - row_in_col * dy
+        if kind == "header":
+            ax_leg.text(
+                x, y, val, fontsize=fontsize, fontweight="bold", ha="left", va="top"
+            )
+            continue
+        ax_leg.plot(
+            [x],
+            [y],
+            marker="o",
+            ms=markersize,
+            mfc=palette.get(val, "0.4"),
+            mec="#222222",
+            mew=0.8,
+            ls="None",
+        )
+        ax_leg.text(x + colpad, y, val, fontsize=fontsize, ha="left", va="center")
+
+
+def mean_ci_boot(y, n_boot=3000, ci=95, rng=None):
+    """Return (mean, lo, hi): mean of the non-NaN `y` and its percentile-bootstrap CI."""
+    data = np.asarray(y, float)
+    data = data[~np.isnan(data)]
+    if data.size == 0:
+        return np.nan, np.nan, np.nan
+    center = float(np.mean(data))
+    if data.size == 1:
+        return center, center, center  # nothing to resample
+    gen = np.random.default_rng() if rng is None else rng  # unseeded unless supplied
+    draws = gen.integers(0, data.size, size=(n_boot, data.size))
+    boot_means = np.mean(data[draws], axis=1)
+    tail = (100 - ci) / 2
+    lo, hi = (float(np.percentile(boot_means, q)) for q in (tail, 100 - tail))
+    return center, lo, hi
+
+
+def summarize_methods_across_tasks(
+    df_plot, ycol, method_col="dFC method", ci_func=mean_ci_boot
+):
+    """
+    Return a DataFrame with columns [method_col, 'mean', 'lo', 'hi'], one row per
+    observed `method_col` group; the columns exist even when there are no groups.
+    `ci_func` gets each group's `ycol` values and must return (mean, lo, hi).
+    """
+    columns = [method_col, "mean", "lo", "hi"]
+    grouped = df_plot.groupby(method_col, observed=True)[ycol]
+    # callers are expected to pass one row per (task, method), e.g. the BEST table
+    records = [(meth, *ci_func(s.values)) for meth, s in grouped]
+    return pd.DataFrame(records, columns=columns)
+
+
+def overlay_method_mean_ci(
+    ax,
+    df_plot,
+    ycol,
+    method_col="dFC method",
+    line_halfwidth=0.30,
+    cap_halfwidth=0.12,
+    color="#222",
+    lower=None,
+    upper=None,
+    rng=None,
+):
+    """
+    Overlay, per method, a thick mean bar and a thin bootstrap-CI whisker with caps.
+
+    x positions are read from the current x tick labels of `ax`, so call this after
+    the tick labels are in place. Values are clipped to [lower, upper] when those
+    bounds are given; methods without a matching tick or with a NaN mean are skipped.
+    """
+    # tick label text -> x position
+    xticks = ax.get_xticks()
+    tick_of = {lbl.get_text(): xticks[i] for i, lbl in enumerate(ax.get_xticklabels())}
+
+    # per-method mean and bootstrap CI (rng is forwarded to mean_ci_boot)
+    summ = summarize_methods_across_tasks(
+        df_plot, ycol, method_col, ci_func=lambda vals: mean_ci_boot(vals, rng=rng)
+    )
+
+    def _bounded(v):
+        # apply only the bounds that were supplied
+        v = v if lower is None else max(lower, v)
+        return v if upper is None else min(upper, v)
+
+    whisker_kw = dict(colors=color, lw=1.2, alpha=0.9, zorder=5)
+    for _, rec in summ.iterrows():
+        meth, mean = rec[method_col], rec["mean"]
+        if meth not in tick_of or np.isnan(mean):
+            continue
+        x = tick_of[meth]
+        m = _bounded(mean)
+        lo = m if np.isnan(rec["lo"]) else _bounded(rec["lo"])
+        hi = m if np.isnan(rec["hi"]) else _bounded(rec["hi"])
+        # mean bar (zorder 6) is drawn above the whisker and caps (zorder 5)
+        ax.hlines(
+            m, x - line_halfwidth, x + line_halfwidth, colors=color, lw=2.6, zorder=6
+        )
+        ax.vlines(x, lo, hi, **whisker_kw)
+        ax.hlines(
+            [lo, hi],
+            x - cap_halfwidth,
+            x + cap_halfwidth,
+            **whisker_kw,
+        )
+
+
+def wrap_domain(dom: str, max_len: int = 20) -> str:
+    """Break `dom` after " &" and ", ", then soft-wrap each piece to `max_len` columns."""
+    pieces = dom.replace(" & ", " &\n").replace(", ", ",\n").splitlines()
+    wrapper = textwrap.TextWrapper(
+        width=max_len, break_long_words=False, break_on_hyphens=True
+    )
+    lines = []
+    for piece in pieces:
+        # words longer than max_len stay whole; an empty piece keeps its own line
+        lines += wrapper.wrap(piece) or [""]
+    return "\n".join(lines)
+
+
+def add_domains_between_ylabel_and_ticks(
+    ax,
+    row_order,
+    task_to_domain,
+    label_rotation=30,
+    tick_pad_pts=28,
+    ylabel_pad_pts=60,
+    domain_x_frac=-0.11,  # x position for the domain column (axes frac)
+    left_extend_frac=0.02,  # how far past the text the line extends
+    label_x_offset_frac=0.008,  # small nudge right from domain_x_frac
+    label_align="left",  # "left" | "center" | "right"
+    label_kw=None,
+    sep_kw=None,
+):
+    """
+    Write a domain label beside each run of consecutive same-domain rows and draw
+    separator lines where the domain changes.
+
+    Rows are taken in `row_order`; tasks missing from `task_to_domain` count as
+    "Other". Labels sit near x = `domain_x_frac` (axes fraction), vertically centered
+    on their run (data coordinates); `label_align` sets their horizontal alignment
+    whether or not `label_kw` is supplied. y tick and y-label padding are set from
+    `tick_pad_pts` / `ylabel_pad_pts`. Nothing is drawn for an empty `row_order`.
+
+    Raises KeyError if `label_align` is not "left", "center" or "right".
+    """
+    ha = {"left": "left", "center": "center", "right": "right"}[label_align]
+    if label_kw is None:
+        label_kw = dict(fontsize=10, fontweight="bold", color="#222")
+    # the requested alignment always wins; other caller styles are kept
+    label_kw = {**label_kw, "ha": ha, "va": "center"}
+    if sep_kw is None:
+        sep_kw = dict(color="#777", lw=1.0, alpha=0.9)
+    if not row_order:
+        return
+
+    ax.tick_params(axis="y", pad=tick_pad_pts)
+    ax.yaxis.labelpad = ylabel_pad_pts
+
+    # row centers: use tick positions when the tick labels line up with the rows
+    n = len(row_order)
+    yticks = ax.get_yticks()
+    yticklabs = [t.get_text() for t in ax.get_yticklabels()]
+    if yticklabs and len(yticklabs) == n:
+        lbl2y = dict(zip(yticklabs, yticks))
+        y_centers = [lbl2y.get(t, np.nan) for t in row_order]
+    else:
+        # fallback: assumes n equal bands across the y-range, counted from the smaller
+        # limit (heatmap-style); band midpoints stay finite even when n == 1
+        y0, y1 = ax.get_ylim()
+        row_height = abs(y1 - y0) / n
+        y_centers = min(y0, y1) + (np.arange(n) + 0.5) * row_height
+
+    doms = [task_to_domain.get(t, "Other") for t in row_order]
+    # (domain, first_row, last_row) for each run of identical consecutive domains
+    blocks = []
+    start = 0
+    for i in range(1, n):
+        if doms[i] != doms[i - 1]:
+            blocks.append((doms[start], start, i - 1))
+            start = i
+    blocks.append((doms[start], start, n - 1))
+
+    # x in axes fraction, y in data coordinates
+    trans = mtransforms.blended_transform_factory(ax.transAxes, ax.transData)
+    x_text = domain_x_frac + (label_x_offset_frac if label_align == "left" else 0.0)
+    for dom, i0, i1 in blocks:
+        y_block = float(np.nanmean(y_centers[i0 : i1 + 1]))
+        ax.text(
+            x_text,
+            y_block,
+            wrap_domain(dom, max_len=24),
+            rotation=label_rotation,
+            transform=trans,
+            clip_on=False,
+            **label_kw,
+        )
+
+    # separators: across the plot area, then extended left into the domain column
+    x_min, x_max = ax.get_xlim()
+    for i in range(n - 1):
+        if doms[i + 1] != doms[i]:
+            y_sep = 0.5 * (y_centers[i] + y_centers[i + 1])
+            ax.hlines(y_sep, x_min, x_max, **sep_kw)
+            ax.plot(
+                [0.0, domain_x_frac - left_extend_frac],
+                [y_sep, y_sep],
+                transform=trans,
+                clip_on=False,
+                **sep_kw,
+            )
+
+
+###################### task_timing_stats ######################
+
+
+def as_long_df(d, value_col, task_col="task"):
+    """Flatten {task: iterable of values} into a long DataFrame, one row per value."""
+    records = [
+        {task_col: key, value_col: item} for key, seq in d.items() for item in seq
+    ]
+    return pd.DataFrame(records)
+
+
+# --- median labels with matching hue colors (log-safe) ---
+def annotate_medians_by_geometry(
+    ax,
+    df_long,
+    x_col,
+    hue_col,
+    y_col,
+    x_order,
+    hue_order,
+    fmt="{:.0f}",  # ints; change to "{:.2g}" if you prefer
+    y_nudge_factor=1.08,
+    bin_halfwidth=0.6,
+    bbox_alpha=0.9,
+):
+    """
+    Label every box of a hue-grouped boxplot with the median of its group.
+
+    Box patches (PathPatch) are located from their outline: each is assigned to
+    the integer x tick nearest its horizontal center (if within `bin_halfwidth`),
+    and the boxes of one tick, read left to right, are paired with `hue_order`.
+    Medians come from `df_long` grouped by (`x_col`, `hue_col`).
+
+    The label sits at median * `y_nudge_factor` (a multiplicative nudge), on a
+    box-colored background of opacity `bbox_alpha`, in black or white text chosen
+    by luminance. Groups whose median is missing, non-finite or <= 0 get no label.
+
+    Returns None; does nothing when `ax` holds no box patches.
+    """
+
+    def _path_patches(candidates):
+        return [p for p in candidates if isinstance(p, mpl.patches.PathPatch)]
+
+    def _luminance(r, g, b):
+        # weighted RGB sum used only to choose a contrasting text color
+        return 0.299 * r + 0.587 * g + 0.114 * b
+
+    # prefer `ax.artists`; fall back to `ax.patches` when it holds no boxes
+    patches = _path_patches(getattr(ax, "artists", [])) or _path_patches(ax.patches)
+    if not patches:
+        return
+
+    # bucket (x_center, patch) by x tick index 0..len(x_order)-1
+    boxes_by_tick = {tick: [] for tick in range(len(x_order))}
+    for p in patches:
+        xs = p.get_path().vertices[:, 0]
+        x_center = 0.5 * (xs.min() + xs.max())
+        tick = int(round(x_center))
+        if tick in boxes_by_tick and abs(x_center - tick) <= bin_halfwidth:
+            boxes_by_tick[tick].append((x_center, p))
+
+    # medians computed from the data, keyed by (x value, hue value)
+    med_dict = df_long.groupby([x_col, hue_col])[y_col].median().to_dict()
+
+    # text properties shared by every label
+    label_style = dict(
+        ha="center",
+        va="center",
+        fontsize=9,
+        fontweight="bold",
+        zorder=100,
+        clip_on=False,
+    )
+
+    for tick, task in enumerate(x_order):
+        # stable left -> right order inside this tick
+        group = sorted(boxes_by_tick.get(tick, []), key=lambda pair: pair[0])
+        # zip stops at the shorter sequence: surplus hues or boxes stay unlabeled
+        for hue, (x_center, patch) in zip(hue_order, group):
+            med = med_dict.get((task, hue), np.nan)
+            if not (np.isfinite(med) and med > 0):
+                continue
+
+            fc = patch.get_facecolor()  # RGBA of this box
+            if fc is None or len(fc) < 3:
+                # rare fallback: next color from the axes' cycle, as RGBA
+                fc = mpl.colors.to_rgba(ax._get_lines.get_next_color())
+            r, g, b, _ = fc
+            # same color as the box, but with the textbox's own alpha for legibility
+            fc_box = (r, g, b, bbox_alpha)
+
+            ax.text(
+                x_center,
+                med * y_nudge_factor,
+                fmt.format(med),
+                color="black" if _luminance(r, g, b) > 0.6 else "white",
+                bbox=dict(boxstyle="round,pad=0.2", fc=fc_box, ec="none"),
+                **label_style,
+            )
+
+
+# ---------- helpers: median ordering + median labeler (single-category boxplot) ----------
+def order_by_median_dict(d, reverse=True):
+    """Rank non-empty tasks by median; return (ranked_names, {task: (median, std)})."""
+    stats = {name: (np.median(v), np.std(v)) for name, v in d.items() if len(v) > 0}
+    ranked = sorted(stats, key=lambda name: stats[name][0], reverse=reverse)
+    return ranked, stats
+
+
+def annotate_medians_single_boxplot(
+    ax, df_long, x_col, y_col, order, fmt="{:.2f}", box_alpha=0.90
+):
+    """
+    Write each category's median on a seaborn.boxplot drawn *without hue*.
+
+    The k-th box patch found on `ax` is paired with the k-th entry of `order`.
+    Each label is centered horizontally on its box and placed 2% of the current
+    y-range above the median (capped at 1% of the range below the upper limit),
+    on a background of the box's face color at opacity `box_alpha`.
+    Call this AFTER setting any y-limits, because both offsets depend on them.
+
+    Categories with a non-finite median are skipped.
+    """
+    # medians aligned with the plotting order
+    med = df_long.groupby(x_col)[y_col].median().reindex(order)
+
+    # box PathPatches: `ax.artists` when it holds any, otherwise `ax.patches`
+    boxes = [
+        p for p in getattr(ax, "artists", []) if isinstance(p, mpl.patches.PathPatch)
+    ]
+    if not boxes:
+        boxes = [p for p in ax.patches if isinstance(p, mpl.patches.PathPatch)]
+
+    ymin, ymax = ax.get_ylim()
+    y_span = ymax - ymin
+    y_cap = ymax - 0.01 * y_span  # keeps labels off the top bound
+    text_style = dict(
+        ha="center",
+        va="center",
+        fontsize=9,
+        fontweight="bold",
+        zorder=100,
+        clip_on=False,
+    )
+
+    for patch, m in zip(boxes, med):
+        # horizontal center of the box outline
+        xs = patch.get_path().vertices[:, 0]
+        x_center = 0.5 * (xs.min() + xs.max())
+        if not np.isfinite(m):
+            continue
+
+        # label background = box face color at the requested opacity
+        fc = patch.get_facecolor()
+        if fc is None or len(fc) < 3:
+            fc = mpl.colors.to_rgba("white", box_alpha)
+        else:
+            fc = (fc[0], fc[1], fc[2], box_alpha)
+
+        # light backgrounds get black text, dark ones white
+        lum = 0.299 * fc[0] + 0.587 * fc[1] + 0.114 * fc[2]
+        ax.text(
+            x_center,
+            min(m + 0.02 * y_span, y_cap),
+            fmt.format(m),
+            color="black" if lum > 0.6 else "white",
+            bbox=dict(boxstyle="round,pad=0.2", fc=fc, ec="none"),
+            **text_style,
+        )
+
+
+###################### task_presence_binarization ######################
+
+###################### dfc_visualization ######################
+
+
+def _window_indices(
+    trs, window_len=8, center="middle", center_time=None, center_index=None, interval=None
+):
+    """Indices into `trs`: the `interval` matches, else a window around a center."""
+    # NOTE: `center` is accepted but never read by this function
+    n_points = len(trs)
+    trs = np.asarray(trs)
+    if interval is not None:
+        lo, hi = interval
+        hits = np.where((trs >= lo) & (trs <= hi))[0]
+        if len(hits) == 0:
+            raise ValueError("interval produced no indices; check units.")
+        return hits
+    if center_index is not None:
+        mid = int(np.clip(center_index, 0, n_points - 1))
+    elif center_time is not None:
+        mid = int(np.argmin(np.abs(trs - center_time)))
+    else:
+        mid = (n_points - 1) // 2
+    # shift the window back inside the series when it would overrun either end
+    stop = min(n_points, max(0, mid - window_len // 2) + window_len)
+    return np.arange(max(0, stop - window_len), stop, dtype=int)
+
+
+def _common_limits(dfc_dict, robust_percentile=(2, 98), symmetric=True):
+    """Color limits from percentiles of the pooled above-diagonal values of all arrays."""
+
+    def _upper(A):
+        rows, cols = np.triu_indices(A.shape[1], 1)
+        return A[:, rows, cols].ravel()
+
+    pooled = np.concatenate([_upper(A) for A in dfc_dict.values()])
+    lo, hi = np.percentile(pooled, robust_percentile)
+    bound = max(abs(lo), abs(hi))
+    return (-bound, bound) if symmetric else (lo, hi)
+
+
+def figure_dfc_matrices_window_png(
+    dfc_dict,
+    trs,
+    window_len=8,
+    center="middle",
+    center_time=None,
+    center_index=None,
+    interval=None,
+    cmap="coolwarm",
+    outfile="fig_dfc_window.png",
+    show_region_ticks=False,
+    region_labels=None,
+    draw_network_bounds=None,
+    dpi=600,
+    transparent=False,
+    # style knobs
+    method_label_size=11,
+    tr_label_size=10,
+    cbar_label_size=11,
+    rotate_method_labels=90,
+    method_label_pad=18,  # << controls distance between method names and images
+    wspace=None,  # << override column spacing if needed (None = auto)
+):
+    """
+    Save to `outfile` a grid of dFC matrices: one row per key of `dfc_dict`, one
+    column per index picked from `trs` by `_window_indices`; one shared color scale.
+    """
+    import matplotlib as mpl
+    import matplotlib.pyplot as plt
+    import numpy as np
+    from matplotlib import gridspec
+
+    mpl.rcParams.update(  # changes global matplotlib settings; not restored here
+        {
+            "figure.dpi": dpi,
+            "savefig.dpi": dpi,
+            "pdf.fonttype": 42,
+            "ps.fonttype": 42,
+            "font.size": 8,
+            "axes.titlesize": tr_label_size,
+            "axes.labelsize": method_label_size,
+        }
+    )
+
+    methods = list(dfc_dict.keys())
+    R = next(iter(dfc_dict.values())).shape[1]  # only used to space region ticks
+
+    idxs = _window_indices(
+        trs,
+        window_len=window_len,
+        center=center,
+        center_time=center_time,
+        center_index=center_index,
+        interval=interval,
+    )
+
+    vmin, vmax = _common_limits(dfc_dict, robust_percentile=(2, 98), symmetric=True)
+    vmin = 0  # lower limit forced to 0, replacing the value returned above
+    # figure size in inches: col_width per column (+0.5 column extra), row_height per row
+    col_width = 1.6
+    row_height = 1.5
+    nrows, ncols = len(methods), len(idxs)
+
+    fig = plt.figure(figsize=((ncols + 0.5) * col_width, nrows * row_height))
+    # spacing: automatic wspace grows with the column count, capped at 0.35
+    auto_wspace = min(0.35, 0.12 + 0.01 * ncols)
+    wspace = auto_wspace if wspace is None else wspace
+    hspace = 0.25
+    # add a dedicated colorbar column on the far right
+    gs = gridspec.GridSpec(
+        nrows,
+        ncols + 1,
+        width_ratios=[1] * ncols + [0.06],  # last slot = colorbar
+        hspace=hspace,
+        wspace=wspace,
+    )
+
+    last_im = None
+    for r, m in enumerate(methods):
+        A = dfc_dict[m]  # presumably indexed (time, region, region) — TODO confirm
+        for c, t_idx in enumerate(idxs):
+            ax = fig.add_subplot(gs[r, c])
+            M = A[t_idx].copy()  # copy so blanking the diagonal leaves dfc_dict untouched
+            np.fill_diagonal(M, np.nan)
+            im = ax.imshow(M, vmin=vmin, vmax=vmax, cmap=cmap, interpolation="none")
+            last_im = im  # every image shares vmin/vmax/cmap, so any one feeds the colorbar
+
+            if draw_network_bounds:
+                for b in draw_network_bounds:  # thin black lines between cells b - 1 and b
+                    ax.axhline(b - 0.5, linewidth=0.4, color="k")
+                    ax.axvline(b - 0.5, linewidth=0.4, color="k")
+
+            if show_region_ticks and region_labels is not None:
+                step = max(1, R // 16)  # label every `step`-th region
+                ticks = np.arange(0, R, step)
+                ax.set_xticks(ticks)
+                ax.set_yticks(ticks)
+                ax.set_xticklabels(
+                    [region_labels[i] for i in ticks], rotation=90, fontsize=6
+                )
+                ax.set_yticklabels([region_labels[i] for i in ticks], fontsize=6)
+            else:
+                ax.set_xticks([])
+                ax.set_yticks([])
+            for s in ax.spines.values():
+                s.set_visible(False)
+
+            if r == 0:  # column titles on the first row only
+                label = (
+                    f"TR{trs[t_idx]}"
+                    if np.issubdtype(np.asarray(trs).dtype, np.number)
+                    else str(trs[t_idx])
+                )
+                ax.set_title(label, pad=6, fontsize=tr_label_size, fontweight="bold")
+
+            if c == 0:  # method name on the first column only
+                ax.set_ylabel(
+                    m,
+                    rotation=rotate_method_labels,
+                    labelpad=method_label_pad,  # << tighten/loosen here
+                    va="center",
+                    ha="center",
+                    fontsize=method_label_size,
+                    fontweight="bold",
+                )
+                ax.yaxis.set_label_position("left")
+    # colorbar in its own axis (no overlap), spanning all rows of the last column
+    cax = fig.add_subplot(gs[:, -1])
+    cbar = fig.colorbar(last_im, cax=cax)
+    cbar.set_label("Connectivity", fontsize=cbar_label_size, fontweight="bold")
+    cbar.ax.tick_params(labelsize=max(8, cbar_label_size - 1))
+
+    fig.subplots_adjust(left=0.10, right=0.98, top=0.95, bottom=0.05)
+    fig.savefig(
+        outfile,
+        bbox_inches="tight",
+        pad_inches=0.02,
+        transparent=transparent,
+        facecolor="white",
+    )
+    plt.close(fig)
+    print(
+        f"Saved {outfile}  |  TR columns: {len(idxs)}  |  vmin={vmin:.3f}, vmax={vmax:.3f}  |  dpi={dpi}"
+    )
diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
new file mode 100644
index 0000000..e849704
--- /dev/null
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -0,0 +1,544 @@
+import argparse
+import json
+import os
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from matplotlib.colors import to_rgba
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from helper_functions import (  # pyright: ignore[reportMissingImports]
+    add_domains_between_ylabel_and_ticks,
+    boldify_axes,
+    build_task_order_and_palette,
+    domain_sorted_rows,
+    draw_grouped_legend_panel,
+    get_cog_domain_info,
+    savefig_pub,
+    setup_pub_style,
+    task_domain_real,
+    task_domain_simul,
+)
+
+level = "group_lvl"  # key of each loaded ML_scores dict whose entries are collected
+keys_not_to_include = [  # score keys left out when building ALL_ML_SCORES
+    "Logistic regression permutation p_value",
+    "Logistic regression permutation score mean",
+    "Logistic regression permutation score std",
+    "SVM permutation p_value",
+    "SVM permutation score mean",
+    "SVM permutation score std",
+]
+
+#######################################################################################
+
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to make figures/tables from multi-dataset ML results.
+    """
+
+    setup_pub_style()
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    parser.add_argument(
+        "--multi_dataset_info", type=str, help="path to multi-dataset info file"
+    )
+    parser.add_argument(
+        "--simul_or_real", type=str, help="Specify 'simulated' or 'real' data"
+    )
+
+    args = parser.parse_args()
+
+    multi_dataset_info = args.multi_dataset_info
+    simul_or_real = args.simul_or_real
+
+    # Read dataset info
+    with open(multi_dataset_info, "r") as f:
+        multi_dataset_info = json.load(f)
+
+    if simul_or_real == "real":
+        main_root = multi_dataset_info["real_data"]["main_root"]
+        DATASETS = multi_dataset_info["real_data"]["DATASETS"]
+        TASKS_to_include = multi_dataset_info["real_data"]["TASKS_to_include"]
+    elif simul_or_real == "simulated":
+        main_root = multi_dataset_info["simulated_data"]["main_root"]
+        DATASETS = multi_dataset_info["simulated_data"]["DATASETS"]
+        TASKS_to_include = multi_dataset_info["simulated_data"]["TASKS_to_include"]
+
+    output_root = f"{multi_dataset_info['output_root']}/ML_results"
+
+    ALL_ML_SCORES = None
+    for dataset in DATASETS:
+        print(f"Processing dataset: {dataset}")
+        dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
+        ML_root = f"{main_root}/{dataset}/derivatives/ML"
+
+        # Read dataset info
+        with open(dataset_info_file, "r") as f:
+            dataset_info = json.load(f)
+
+        if "SESSIONS" in dataset_info:
+            SESSIONS = dataset_info["SESSIONS"]
+        else:
+            SESSIONS = None
+        if SESSIONS is None:
+            SESSIONS = [None]
+
+        TASKS = dataset_info["TASKS"]
+
+        if "RUNS" in dataset_info:
+            RUNS = dataset_info["RUNS"]
+        else:
+            RUNS = None
+        if RUNS is None:
+            RUNS = {task: [None] for task in TASKS}
+
+        # find all ML_scores_classify_dFC-id.npy in the ML_root/classfication/ folder
+        # for now we will only use the first session
+        session = SESSIONS[0]
+        if session is None:
+            input_dir = f"{ML_root}/classification"
+        else:
+            input_dir = f"{ML_root}/classification/{session}"
+        if not os.path.exists(input_dir):
+            print(
+                f"Input directory {input_dir} does not exist. Skipping dataset {dataset}."
+            )
+            continue
+        ALL_ML_SCORES_FILES = os.listdir(input_dir)
+        ALL_ML_SCORES_FILES = [
+            f for f in ALL_ML_SCORES_FILES if "ML_scores_classify_" in f
+        ]
+        for f in ALL_ML_SCORES_FILES:
+            try:
+                ML_scores_new = np.load(f"{input_dir}/{f}", allow_pickle=True).item()
+                # ML_scores_new_updated is a new dictionary with same keys as ML_scores_new but empty lists
+                ML_scores_new_updated = {
+                    key: []
+                    for key in ML_scores_new[level].keys()
+                    if key not in keys_not_to_include
+                }
+                for task in TASKS:
+                    if task not in TASKS_to_include:
+                        continue
+                    if task not in ML_scores_new[level]["task"]:
+                        dFC_method = set(ML_scores_new[level]["dFC method"])
+                        print(f"Task {task} not in ML_scores of {dFC_method}. Skipping.")
+                        continue
+                    for i in range(len(ML_scores_new[level]["task"])):
+                        for key in ML_scores_new_updated.keys():
+                            ML_scores_new_updated[key].append(
+                                ML_scores_new[level][key][i]
+                            )
+
+                if ALL_ML_SCORES is None:
+                    ALL_ML_SCORES = ML_scores_new_updated
+                else:
+                    for key in ML_scores_new_updated.keys():
+                        if key in ALL_ML_SCORES:
+                            ALL_ML_SCORES[key].extend(ML_scores_new_updated[key])
+            except Exception as e:
+                print(f"Error loading {f}: {e}")
+                continue
+
+    # check that the lists in all keys have the same length
+    if ALL_ML_SCORES is not None:
+        lengths = [len(v) for v in ALL_ML_SCORES.values()]
+        if len(set(lengths)) != 1:
+            print(
+                f"Warning: Not all keys have the same length in ALL_ML_SCORES. key and length pairs: {dict(zip(ALL_ML_SCORES.keys(), lengths))}"
+            )
+
+    # save ALL_ML_SCORES
+    if not os.path.exists(output_root):
+        os.makedirs(output_root)
+    np.save(f"{output_root}/ALL_ML_SCORES_{simul_or_real}.npy", ALL_ML_SCORES)
+
+    # ===== Plotting =====
+    DOMAIN_ORDER, TASK2DOMAIN, DOMAIN_BASE = get_cog_domain_info(simul_or_real)
+    # knobs
+    GROUP = "test"
+    TARGETS = [
+        ("PCA", "Logistic regression balanced accuracy"),
+        ("LE", "Logistic regression balanced accuracy"),
+        ("PCA", "SVM balanced accuracy"),
+        ("LE", "SVM balanced accuracy"),
+        ("LE", "SI"),
+        ("PCA", "SI"),
+    ]
+    # -------------------------------------------------------------------
+
+    sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.2})
+    sns.set_style("darkgrid")
+
+    AGG_FUNC = np.median  # across-run aggregation
+
+    for embedding, metric in TARGETS:
+        # ---- slice ----
+        df = pd.DataFrame.from_dict(ALL_ML_SCORES)
+        df = df[df["task"].isin(TASKS_to_include)]
+        df = df[(df["embedding"] == embedding) & (df["group"] == GROUP)]
+
+        # methods in alphabetical order (your current rule)
+        method_order = sorted(df["dFC method"].unique(), key=lambda s: s.lower())
+        df["dFC method"] = pd.Categorical(
+            df["dFC method"], categories=method_order, ordered=True
+        )
+
+        # --- domain tagging & task ordering/coloring (only for real data) ---
+        if simul_or_real == "real":
+            df["domain"] = df["task"].map(task_domain_real)
+        elif simul_or_real == "simul":
+            df["domain"] = df["task"].map(task_domain_simul)
+        # Use tasks present in THIS slice
+        task_order, task_palette = build_task_order_and_palette(
+            df["task"].unique(),
+            simul_or_real=simul_or_real,
+            similarity_L=0.05,
+            similarity_S=0.04,
+        )
+
+        # ===== build BEST and ACROSS tables =====
+        counts_task = df.groupby("task")["run"].nunique()
+        multi_tasks = counts_task[counts_task > 1].index
+        df_multi = df[
+            df["task"].isin(multi_tasks)
+        ]  # <- use this dataframe for ACROSS figures
+
+        # ACROSS heatmap (aggregate then pivot):
+        if not df_multi.empty:
+            df_across = (
+                df_multi.groupby(["task", "dFC method"], observed=True)[metric]
+                .agg(score=AGG_FUNC)
+                .reset_index()
+            )
+
+        # BEST: one row per (task, method) with the winning run kept
+        df_best = (
+            df.sort_values(["task", "dFC method", metric], ascending=[True, True, False])
+            .drop_duplicates(subset=["task", "dFC method"], keep="first")
+            .rename(columns={metric: "score"})
+        )
+
+        # ----------- POINTPLOT (BEST) -----------
+        # 1) Make a 2-panel figure: left=plot, right=legend
+        fig = plt.figure(figsize=(max(10, 0.6 * len(method_order)) + 5.0, 7.0))
+        gs = fig.add_gridspec(ncols=2, nrows=1, width_ratios=[1.0, 0.5], wspace=0.05)
+        ax = fig.add_subplot(gs[0, 0])
+        ax_leg = fig.add_subplot(gs[0, 1])  # empty panel for the legend
+
+        # --- BACKGROUND: semi-transparent boxplot across tasks (per method) ---
+        # one value per (task, method): use df_best['score']
+        box_face = to_rgba("#DE9995", 0.18)  # neutral gray, ~18% opacity
+        box_edge = "#730800"
+
+        sns.boxplot(
+            data=df_best,
+            x="dFC method",
+            y="score",
+            order=method_order,
+            whis=(
+                5,
+                95,
+            ),  # <- 5th–95th percentile whiskers (change to "range", 1.5, etc. if you prefer)
+            fliersize=0,  # hide outlier dots (keeps background clean)
+            linewidth=1.0,
+            width=0.2,  # narrower than default so points are visible
+            color=box_face,  # face color (we’ll also set edge color below)
+            ax=ax,
+            zorder=1,
+        )
+        # ensure edges are visible but subtle; also enforce alpha on faces
+        for artist in ax.artists:
+            artist.set_edgecolor(box_edge)
+            fc = artist.get_facecolor()
+            artist.set_facecolor((fc[0], fc[1], fc[2], 0.12))  # set alpha explicitly
+        for line in ax.lines:  # whiskers/medians/caps
+            line.set_color(box_edge)
+            line.set_alpha(0.5)
+            line.set_zorder(1)
+
+        # --- OVERLAY: method mean across tasks (black horizontal line) ---
+        # (This is separate from the boxplot's median; gives an easy mean comparison)
+        means = df_best.groupby("dFC method", observed=True)["score"].mean()
+        xticks = ax.get_xticks()
+        xlabs = [t.get_text() for t in ax.get_xticklabels()]
+        xpos = {lab: xticks[i] for i, lab in enumerate(xlabs)}
+
+        # bounds (SI vs BA)
+        if metric == "SI":
+            lower, upper = -1.0, 1.0
+        else:
+            lower, upper = 0.5, 1.0
+
+        halfwidth = 0.1  # how wide the mean bar is around each tick
+        for meth, m in means.items():
+            if meth in xpos and pd.notna(m):
+                m = min(upper, max(lower, m))  # clip to metric range
+                x = xpos[meth]
+                ax.hlines(
+                    m, x - halfwidth, x + halfwidth, colors="#050505", lw=2.4, zorder=3
+                )
+
+        # --- FOREGROUND: your existing per-task pointplot (on top) ---
+        sns.pointplot(
+            data=df_best,
+            x="dFC method",
+            y="score",
+            hue="task",
+            order=method_order,
+            hue_order=task_order,
+            dodge=0.4,
+            errorbar=None,
+            linestyles="",
+            markers="o",
+            palette=task_palette,
+            ax=ax,
+            zorder=6,
+        )
+
+        # optional: crisp marker edges
+        for line in ax.lines:
+            try:
+                line.set_markeredgecolor("#222222")
+                line.set_markeredgewidth(0.8)
+            except Exception:
+                pass
+
+        ax.set_xlabel("dFC method")
+        ax.set_ylabel(metric)
+        if metric == "SI":
+            ax.set_ylim(top=1.02)
+        else:
+            ax.set_ylim(0.48, 1.02)
+        ax.grid(True, axis="y", alpha=0.25)
+        sns.despine(ax=ax, top=True, right=True)
+        plt.setp(ax.get_xticklabels(), rotation=35, ha="right")
+
+        # kill any in-axes legend and draw grouped legend in the right panel
+        if ax.legend_:
+            ax.legend_.remove()
+        if simul_or_real == "real":
+            domain_of = {t: task_domain_real(t) for t in task_order}
+            draw_grouped_legend_panel(
+                ax_leg,
+                task_order,
+                domain_of,
+                task_palette,
+                DOMAIN_ORDER,
+                ncols=2,
+                fontsize=8,
+                markersize=5,
+            )
+            ax_leg.set_title("Task Paradigm", fontsize=9, pad=4, fontweight="bold")
+        elif simul_or_real == "simul":
+            domain_of = {t: task_domain_simul(t) for t in task_order}
+            draw_grouped_legend_panel(
+                ax_leg,
+                task_order,
+                domain_of,
+                task_palette,
+                DOMAIN_ORDER,
+                ncols=1,
+                fontsize=8,
+                markersize=5,
+            )
+            ax_leg.set_title("Task Paradigm", fontsize=9, pad=4, fontweight="bold")
+
+        box = ax_leg.get_position()
+        ax_leg.set_position(
+            [box.x0, box.y0 - 0.03, box.width, box.height]
+        )  # move down by ~3% fig height
+
+        boldify_axes(ax, xlabel="dFC method", ylabel=metric)
+
+        # IMPORTANT: don't call a plain tight_layout() now; the GridSpec already allocates space.
+        # If you must, keep a small margin:
+        fig.tight_layout(rect=[0.02, 0.02, 0.98, 0.98])
+
+        savefig_pub(
+            f"{output_root}/ML_scores_{embedding}_{metric}_{level}_{simul_or_real}_best.png"
+        )
+        plt.close(fig)
+
+        # ----------- HEATMAPS -----------
+        # BEST heatmap: values from df_best
+        mat_best = df_best.pivot(index="task", columns="dFC method", values="score")
+        row_order = domain_sorted_rows(mat_best.index, TASKS_to_include, simul_or_real)
+        col_order = [m for m in method_order if m in mat_best.columns]
+
+        annot_best = df_best.assign(
+            label=lambda x: x["score"].map(lambda v: f"{v:.2f}")
+        ).pivot(index="task", columns="dFC method", values="label")
+
+        if simul_or_real == "real":
+            w = max(10, 0.65 * len(col_order))
+            h = max(6.0, 0.30 * len(row_order))
+        else:
+            w = max(11, 11 / 7 * len(col_order))
+            h = max(7.0, 0.35 * len(row_order))
+        fig, ax = plt.subplots(figsize=(w, h))
+        vmin, vmax, center = (
+            (None, 1.0, 0.0) if metric == "SI" else (0.5 - 1e-6, 1.0, 0.5)
+        )
+        hm = sns.heatmap(
+            mat_best.loc[row_order, col_order],
+            vmin=vmin,
+            vmax=vmax,
+            center=center,
+            cmap="coolwarm",
+            annot=annot_best.loc[row_order, col_order],
+            fmt="",
+            annot_kws={"fontsize": 9, "fontweight": "bold", "linespacing": 1.15},
+            cbar_kws={"shrink": 0.7, "pad": 0.02},
+            ax=ax,
+        )
+        cbar = hm.collections[0].colorbar
+        cbar.set_label(metric, fontsize=10)
+        cbar.ax.tick_params(labelsize=9)
+        boldify_axes(ax, xlabel="dFC method", ylabel="Task Paradigm", rotate_xticks=35)
+
+        if simul_or_real == "real":
+            task_to_domain = {
+                t: task_domain_real(t) for t in row_order
+            }  # your task_domain helper
+            domain_x_frac = -0.8
+            ylabel_pad_pts = 130
+        elif simul_or_real == "simul":
+            task_to_domain = {
+                t: task_domain_simul(t) for t in row_order
+            }  # your task_domain helper
+            domain_x_frac = -1.0
+            ylabel_pad_pts = 110
+        add_domains_between_ylabel_and_ticks(
+            ax,
+            row_order=row_order,
+            task_to_domain=task_to_domain,
+            label_rotation=0,  # try 0, 20, 30, or 45
+            tick_pad_pts=0,  # pushes tick labels to the right
+            ylabel_pad_pts=ylabel_pad_pts,  # moves y-axis label left
+            domain_x_frac=domain_x_frac,  # where domain column sits (more negative = further left)
+            left_extend_frac=0.01,  # extend the line a bit further left than the text
+            label_x_offset_frac=0.010,  # nudge text right from the anchor
+            label_align="left",  # <<< left-align labels
+            label_kw=dict(
+                fontsize=9, fontweight="bold", color="#222", ha="center", va="center"
+            ),
+            sep_kw=dict(color="#777", lw=1.0, alpha=0.9),
+        )
+
+        # Bold colorbar label (metric name) too:
+        cbar.set_label(metric, fontsize=10, fontweight="bold")
+
+        ax.set_xlabel("dFC method")
+        ax.set_ylabel("Task Paradigm")
+        # ax.set_title(f"Best across runs • {embedding} • {metric}", pad=8)
+        ax.tick_params(axis="x", labelrotation=35, labelsize=9)
+        plt.setp(ax.get_xticklabels(), fontweight="bold", rotation=35, ha="right")
+        plt.setp(ax.get_yticklabels(), fontweight="bold")
+        sns.despine(ax=ax, top=True, right=True)
+        plt.tight_layout()
+        savefig_pub(
+            f"{output_root}/ML_scores_heatmap_{embedding}_{metric}_{level}_{simul_or_real}_best.png"
+        )
+        plt.close(fig)
+
+        # ACROSS heatmap: color = median; annotation = Stability & n (ALWAYS range-based)
+        if df_multi.empty:
+            print(
+                f"[ACROSS-RUN] No tasks with ≥2 runs for {embedding} / {metric} — skipping across-run figures."
+            )
+        else:
+            # aggregate across runs
+            s = (
+                df_multi.groupby(["task", "dFC method"], observed=True)[metric]
+                .agg(n="count", med="median", vmin="min", vmax="max")
+                .reset_index()
+            )
+
+            # metric bounds & heatmap scaling
+            if metric == "SI":
+                rng = 2.0  # SI in [-1, 1]
+                vmin, vmax, center = None, 1.0, 0.0
+            else:
+                rng = 1.0  # accuracies in [0, 1]
+                vmin, vmax, center = 0.5 - 1e-6, 1.0, 0.5
+
+            # ALWAYS: range-based stability
+            s["stability"] = (1.0 - ((s["vmax"] - s["vmin"]) / rng)).clip(0.0, 1.0)
+
+            # pivots
+            mat_across = s.pivot(index="task", columns="dFC method", values="med")
+            ann_text = s.assign(
+                label=lambda d: d["stability"].map(lambda v: f"{v:.2f}")
+                + "\n"
+                + d["n"].map(lambda n: f"n={n}")
+            ).pivot(index="task", columns="dFC method", values="label")
+
+            # order
+            row_order = domain_sorted_rows(
+                mat_across.index, TASKS_to_include, simul_or_real
+            )
+            col_order = [m for m in method_order if m in mat_across.columns]
+
+            # plot
+            w = max(9.0, 9 / 7 * len(col_order))
+            h = max(7.0, 7 / 20 * len(row_order))
+            fig, ax = plt.subplots(figsize=(w, h))
+            hm = sns.heatmap(
+                mat_across.loc[row_order, col_order],
+                vmin=vmin,
+                vmax=vmax,
+                center=center,
+                cmap="coolwarm",
+                annot=ann_text.loc[row_order, col_order],
+                fmt="",
+                annot_kws={"fontsize": 9, "fontweight": "bold", "linespacing": 1.15},
+                cbar_kws={"shrink": 0.7, "pad": 0.02},
+                ax=ax,
+            )
+
+            # domain sidebar & separators (your helper)
+            if simul_or_real == "real":
+                task_to_domain = {t: task_domain_real(t) for t in row_order}
+                domain_x_frac = -0.8
+                ylabel_pad_pts = 130
+            elif simul_or_real == "simul":
+                task_to_domain = {t: task_domain_simul(t) for t in row_order}
+                domain_x_frac = -1.0
+                ylabel_pad_pts = 110
+            add_domains_between_ylabel_and_ticks(
+                ax,
+                row_order=row_order,
+                task_to_domain=task_to_domain,
+                label_rotation=0,
+                tick_pad_pts=0,
+                ylabel_pad_pts=ylabel_pad_pts,
+                domain_x_frac=domain_x_frac,
+                left_extend_frac=0.01,
+                label_x_offset_frac=0.010,
+                label_align="left",
+                label_kw=dict(
+                    fontsize=9, fontweight="bold", color="#222", ha="center", va="center"
+                ),
+                sep_kw=dict(color="#777", lw=1.0, alpha=0.9),
+            )
+
+            # cosmetics
+            cbar = hm.collections[0].colorbar
+            cbar.set_label(metric, fontsize=10, fontweight="bold")
+            boldify_axes(
+                ax, xlabel="dFC method", ylabel="Task Paradigm", rotate_xticks=35
+            )
+            plt.setp(ax.get_xticklabels(), fontweight="bold", rotation=35, ha="right")
+            plt.setp(ax.get_yticklabels(), fontweight="bold")
+            sns.despine(ax=ax, top=True, right=True)
+            plt.tight_layout()
+            savefig_pub(
+                f"{output_root}/ML_scores_heatmap_{embedding}_{metric}_{level}_{simul_or_real}_across.png"
+            )
+            plt.close(fig)
diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
new file mode 100644
index 0000000..04f6459
--- /dev/null
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -0,0 +1,476 @@
+import argparse
+import json
+import os
+
+import matplotlib.pyplot as plt
+import matplotlib.transforms as mtransforms
+import numpy as np
+from matplotlib.colors import ListedColormap
+from scipy.cluster.hierarchy import leaves_list, linkage
+from scipy.stats import ttest_ind
+
+from pydfc.ml_utils import (
+    dFC_feature_extraction,
+    embed_dFC_features,
+    find_available_subjects,
+    process_SB_features,
+)
+
+use_raw_features = False  # if True, plot raw dFC features instead of embedded features
+normalize_dFC = True  # forwarded to dFC_feature_extraction(normalize_dFC=...)
+FCS_proba_for_SB = True  # forwarded to dFC_feature_extraction for state-based measures
+train_test_ratio = 0.8  # fraction of available subjects drawn for the train split
+embedding = "LE"  # embedding name passed to embed_dFC_features
+
+if use_raw_features:
+    raw_or_embedded = "_raw"  # suffix appended to every saved figure filename
+else:
+    raw_or_embedded = ""
+
+#######################################################################################
+
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to visualize the feature-sample matrix for each dataset, task, and dFC measure.
+    """
+
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    parser.add_argument(
+        "--multi_dataset_info", type=str, help="path to multi-dataset info file"
+    )
+    parser.add_argument(
+        "--simul_or_real", type=str, help="Specify 'simulated' or 'real' data"
+    )
+
+    args = parser.parse_args()
+
+    multi_dataset_info = args.multi_dataset_info
+    simul_or_real = args.simul_or_real
+
+    # Read dataset info
+    with open(multi_dataset_info, "r") as f:
+        multi_dataset_info = json.load(f)
+
+    if simul_or_real == "real":
+        main_root = multi_dataset_info["real_data"]["main_root"]
+        DATASETS = multi_dataset_info["real_data"]["DATASETS"]
+        TASKS_to_include = multi_dataset_info["real_data"]["TASKS_to_include"]
+    elif simul_or_real == "simulated":
+        main_root = multi_dataset_info["simulated_data"]["main_root"]
+        DATASETS = multi_dataset_info["simulated_data"]["DATASETS"]
+        TASKS_to_include = multi_dataset_info["simulated_data"]["TASKS_to_include"]
+
+    output_root = f"{multi_dataset_info['output_root']}/feature-sample/{simul_or_real}"
+    if not os.path.exists(output_root):
+        os.makedirs(output_root)
+
+    for dataset in DATASETS:
+        dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
+        roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
+        dFC_root = f"{main_root}/{dataset}/derivatives/dFC_assessed"
+
+        # Read dataset info
+        with open(dataset_info_file, "r") as f:
+            dataset_info = json.load(f)
+
+        if "SESSIONS" in dataset_info:
+            SESSIONS = dataset_info["SESSIONS"]
+        else:
+            SESSIONS = None
+        if SESSIONS is None:
+            SESSIONS = [None]
+
+        TASKS = dataset_info["TASKS"]
+
+        if "RUNS" in dataset_info:
+            RUNS = dataset_info["RUNS"]
+        else:
+            RUNS = None
+        if RUNS is None:
+            RUNS = {task: [None] for task in TASKS}
+
+        for dFC_id in range(0, 7):
+            DATA = {}
+            for session in SESSIONS[:1]:  # Only process the first session
+                for task_id, task in enumerate(TASKS):
+                    if task not in TASKS_to_include:
+                        print(f"Skipping task: {task} as it is not in TASKS_to_include.")
+                        continue
+                    for run in RUNS[task][:1]:  # Only process the first run
+                        print(
+                            f"Processing dataset: {dataset}, task: {task}, run: {run}, session: {session}, dFC_id: {dFC_id}"
+                        )
+
+                        SUBJECTS = find_available_subjects(
+                            dFC_root=dFC_root,
+                            task=task,
+                            run=run,
+                            session=session,
+                            dFC_id=dFC_id,
+                        )
+
+                        # randomly select train_test_ratio of the subjects for training
+                        # and rest for testing using numpy.random.choice
+                        train_subjects = np.random.choice(
+                            SUBJECTS, int(train_test_ratio * len(SUBJECTS)), replace=False
+                        )
+                        test_subjects = np.setdiff1d(SUBJECTS, train_subjects)
+                        print(
+                            f"Number of train subjects: {len(train_subjects)} and test subjects: {len(test_subjects)}"
+                        )
+
+                        (
+                            X_train,
+                            X_test,
+                            y_train,
+                            y_test,
+                            subj_label_train,
+                            subj_label_test,
+                            measure_name,
+                        ) = dFC_feature_extraction(
+                            task=task,
+                            train_subjects=train_subjects,
+                            test_subjects=test_subjects,
+                            dFC_id=dFC_id,
+                            roi_root=roi_root,
+                            dFC_root=dFC_root,
+                            run=run,
+                            session=session,
+                            dynamic_pred="no",
+                            normalize_dFC=normalize_dFC,
+                            FCS_proba_for_SB=FCS_proba_for_SB,  # for state-based dFC features, we use FCS_proba
+                        )
+
+                        if measure_name is None:
+                            print(
+                                f"Skipping dataset: {dataset}, task: {task}, run: {run}, session: {session}, dFC_id: {dFC_id} due to no measure_name."
+                            )
+                            continue
+
+                        measure_is_state_based = None
+                        if measure_name in ["SlidingWindow", "Time-Freq"]:
+                            measure_is_state_based = False
+                        elif measure_name in [
+                            "CAP",
+                            "Clustering",
+                            "ContinuousHMM",
+                            "DiscreteHMM",
+                            "Windowless",
+                        ]:
+                            measure_is_state_based = True
+                        else:
+                            # raise error
+                            raise ValueError(f"Unknown measure name: {measure_name}")
+
+                        if measure_is_state_based:
+                            X_train_embedded = process_SB_features(
+                                X=X_train, measure_name=measure_name
+                            )
+                            X_test_embedded = process_SB_features(
+                                X=X_test, measure_name=measure_name
+                            )
+                        else:
+                            # embed dFC features
+                            try:
+                                X_train_embedded, X_test_embedded = embed_dFC_features(
+                                    train_subjects=train_subjects,
+                                    test_subjects=test_subjects,
+                                    X_train=X_train,
+                                    X_test=X_test,
+                                    y_train=y_train,
+                                    y_test=y_test,
+                                    subj_label_train=subj_label_train,
+                                    subj_label_test=subj_label_test,
+                                    embedding=embedding,
+                                    n_components="auto",
+                                    n_neighbors_LE=125,
+                                    LE_embedding_method="embed+procrustes",
+                                    measure_is_state_based=measure_is_state_based,
+                                )
+                                assert (
+                                    X_train_embedded.shape[0] == y_train.shape[0]
+                                ), "Number of samples do not match."
+                                assert (
+                                    X_test_embedded.shape[0] == y_test.shape[0]
+                                ), "Number of samples do not match."
+                            except Exception as e:
+                                print(
+                                    f"Error in embedding dFC features with {embedding}: {e}"
+                                )
+                                X_train_embedded = None
+                                X_test_embedded = None
+
+                        assert (
+                            task not in DATA
+                        ), f"Task {task} already exists in DATA. Overwriting."
+                        DATA[task] = {
+                            "X_train": X_train,
+                            "X_test": X_test,
+                            "X_train_embedded": X_train_embedded,
+                            "X_test_embedded": X_test_embedded,
+                            "y_train": y_train,
+                            "y_test": y_test,
+                            "subj_label_train": subj_label_train,
+                            "subj_label_test": subj_label_test,
+                            "measure_name": measure_name,
+                        }
+            # save the data
+            # save each task in a separate file and name the file as the task name, measure name, and dataset name
+            for task in DATA.keys():
+                if use_raw_features:
+                    X_train = DATA[task]["X_train"]
+                    X_test = DATA[task]["X_test"]
+                else:
+                    X_train = DATA[task]["X_train_embedded"]
+                    X_test = DATA[task]["X_test_embedded"]
+                y_train = DATA[task]["y_train"]
+                y_test = DATA[task]["y_test"]
+                subj_label_train = DATA[task]["subj_label_train"]
+                subj_label_test = DATA[task]["subj_label_test"]
+                measure_name = DATA[task]["measure_name"]
+
+                if X_train is None or X_test is None:
+                    print(f"Skipping task {task} due to embedding error.")
+                    continue
+
+                # np.save(f"{output_root}/processed_data/{dataset}_{task}_{measure_name}.npy", DATA[task])
+
+                SORT_FEATURES = True
+                ZSCORE = True
+                V_RANGE = 2.0  # heatmap color range after z-scoring
+
+                for group, X, y in zip(
+                    ["train", "test"], [X_train, X_test], [y_train, y_test]
+                ):
+
+                    # X: (n_samples, n_features) = LE-transformed dFC features
+                    # y: (n_samples,) binary (0=rest, 1=task)
+                    # Optional: z-score features so the imshow uses comparable scales
+                    Xz = X.copy().astype(float)
+                    if ZSCORE:
+                        Xz = (Xz - Xz.mean(0)) / (Xz.std(0) + 1e-8)
+
+                    # --- supervised feature order (Welch t-test per feature) ---
+                    t, p = ttest_ind(Xz[y == 1], Xz[y == 0], axis=0, equal_var=False)
+                    if SORT_FEATURES:
+                        if group == "train":
+                            # set on the train pass only; the test pass reuses it
+                            col_order = np.argsort(
+                                -np.abs(t)
+                            )  # strongest class contrast first
+                    else:
+                        col_order = np.arange(Xz.shape[1])  # original order
+
+                    # --- row order: cluster within each class (cosine is nice for patterns) ---
+                    def order_rows(A):  # -> positions into A, in dendrogram leaf order
+                        if len(A) <= 2:  # up to two rows: keep the given order
+                            return np.arange(len(A))
+                        return leaves_list(linkage(A, method="average", metric="cosine"))
+
+                    rest_idx = np.where(y == 0)[0]
+                    task_idx = np.where(y == 1)[0]
+                    rest_order = rest_idx[order_rows(Xz[rest_idx])]
+                    task_order = task_idx[order_rows(Xz[task_idx])]
+                    row_order = np.r_[rest_order, task_order]
+                    split = len(rest_order)
+
+                    # --- plot: main heatmap + class strip + top contrast bar ---
+                    fig = plt.figure(figsize=(20, 10))
+                    gs = fig.add_gridspec(
+                        nrows=2,
+                        ncols=2,
+                        height_ratios=[0.07, 1],
+                        width_ratios=[1, 0.03],
+                        hspace=0.05,
+                        wspace=0.05,
+                    )
+                    ax_top = fig.add_subplot(gs[0, 0])
+                    ax_main = fig.add_subplot(gs[1, 0])
+                    ax_lab = fig.add_subplot(gs[1, 1])
+
+                    # top bar: signed t-stat per feature (same column order)
+                    t_ord = t[col_order]
+                    m = np.nanmax(np.abs(t_ord))
+                    im_top = ax_top.imshow(
+                        t_ord[None, :], aspect="auto", cmap="coolwarm", vmin=-m, vmax=m
+                    )
+                    ax_top.set_xticks([])
+                    ax_top.set_yticks([])
+                    ax_top.set_title(
+                        "Feature contrast (t-stat): task − rest", fontsize=10
+                    )
+
+                    # main heatmap
+                    im = ax_main.imshow(
+                        Xz[row_order][:, col_order],
+                        aspect="auto",
+                        cmap="coolwarm",
+                        vmin=-V_RANGE,
+                        vmax=V_RANGE,
+                    )
+                    ax_main.axhline(
+                        split - 0.5, color="k", lw=1
+                    )  # separator between rest/task
+                    ax_main.set_ylabel("samples")
+                    ax_main.set_xticks([])
+
+                    # class strip (right)
+                    cmap_lbl = ListedColormap(
+                        [[0.85, 0.85, 0.85], [0.25, 0.5, 0.9]]
+                    )  # gray=rest, blue=task
+                    ax_lab.imshow(
+                        y[row_order][:, None],
+                        aspect="auto",
+                        cmap=cmap_lbl,
+                        vmin=0,
+                        vmax=1,
+                    )
+                    ax_lab.set_xticks([])
+                    ax_lab.set_yticks([])
+                    ax_lab.set_title("class")
+
+                    # --- vertical centers in the main heatmap (data coords) ---
+                    n_rows = Xz.shape[0]
+                    y_center_rest = (split - 1) / 2.0 if split > 0 else -0.5
+                    y_center_task = (
+                        (split + (n_rows - 1)) / 2.0 if n_rows > split else split - 0.5
+                    )
+
+                    # centers (already computed)
+                    # y_center_rest, y_center_task
+                    # x positions on the strip:
+                    x_right_ax = 1.0 - 0.02
+
+                    # blended: x in ax_lab axes coords, y in ax_main data coords
+                    blended = mtransforms.blended_transform_factory(
+                        ax_lab.transAxes, ax_main.transData
+                    )
+
+                    # left label: place at strip edge, nudge  +6 pts right
+                    ax_main.annotate(
+                        "rest (0)",
+                        xy=(x_right_ax, y_center_rest),
+                        xycoords=blended,
+                        xytext=(6, 0),
+                        textcoords="offset points",  # -> to the right of the strip
+                        ha="left",
+                        va="center",
+                        fontsize=9,
+                        zorder=7,
+                    )
+
+                    # right label: place at strip edge, nudge  +6 pts further right
+                    ax_main.annotate(
+                        "task (1)",
+                        xy=(x_right_ax, y_center_task),
+                        xycoords=blended,
+                        xytext=(6, 0),
+                        textcoords="offset points",  # -> outside the strip on the right
+                        ha="left",
+                        va="center",
+                        fontsize=9,
+                        zorder=7,
+                    )
+
+                    # colorbar (small)
+                    cax = fig.add_axes(
+                        [0.12, 0.06, 0.3, 0.02]
+                    )  # x, y, w, h in figure coords
+                    cb = plt.colorbar(im, cax=cax, orientation="horizontal")
+                    cb.set_label("z-scored feature value", fontsize=9)
+                    cb.ax.tick_params(labelsize=8)
+
+                    # if the folder does not exist, create it
+                    if not os.path.exists(f"feature-sample/{measure_name}"):
+                        os.makedirs(f"feature-sample/{measure_name}")
+
+                    plt.savefig(
+                        f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
+                        dpi=150,
+                        bbox_inches="tight",
+                        pad_inches=0.2,
+                        format="png",
+                    )
+                    plt.close()
+
+                    # plot unsorted version as well
+                    print(f"Embedded shape: {X.shape}")
+
+                    # plot X_embedded and y in an imshow as subplots
+                    if group == "train":
+                        w = 30
+                        h = 10
+                    elif group == "test":
+                        w = 10
+                        h = 10
+                    # fig, axs = plt.subplots(2, 1, figsize=(w, h))
+                    # make bottom subplot skinny like a color strip
+                    fig, axs = plt.subplots(
+                        2,
+                        1,
+                        figsize=(w, h),
+                        sharex=True,
+                        gridspec_kw={"height_ratios": [15, 1], "hspace": 0.1},
+                    )
+
+                    split = np.sum(y == 0)  # number of rest samples
+                    axs[0].imshow(X.T, aspect="auto", origin="lower", cmap="seismic")
+                    axs[0].set_title("LE Embedded Features")
+                    axs[0].set_xlabel("Sample")
+                    axs[0].set_ylabel("Feature")
+
+                    axs[1].imshow(
+                        y[np.newaxis, :], aspect=20, origin="lower", cmap="seismic"
+                    )
+                    axs[1].set_title("Target")
+                    axs[1].set_xticks([])
+                    axs[1].set_yticks([])
+                    plt.savefig(
+                        f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_unsorted_{task}_{group}{raw_or_embedded}.png",
+                        dpi=150,
+                        bbox_inches="tight",
+                        pad_inches=0.2,
+                        format="png",
+                    )
+                    plt.close()
+
+                    # stable sort by label: y=0 (rest) samples first, then y=1 (task)
+                    sort_indices = np.argsort(y, kind="stable")
+                    X = X[sort_indices]
+                    y = y[sort_indices]
+
+                    # plot X_embedded and y in an imshow as subplots
+                    # fig, axs = plt.subplots(2, 1, figsize=(w, h))
+                    # make bottom subplot skinny like a color strip
+                    fig, axs = plt.subplots(
+                        2,
+                        1,
+                        figsize=(w, h),
+                        sharex=True,
+                        gridspec_kw={"height_ratios": [15, 1], "hspace": 0.1},
+                    )
+
+                    axs[0].imshow(X.T, aspect="auto", origin="lower", cmap="seismic")
+                    axs[0].axvline(
+                        split - 0.5, color="k", lw=3
+                    )  # separator between rest/task
+                    axs[0].set_title("LE Embedded Features")
+                    axs[0].set_xlabel("Sample")
+                    axs[0].set_ylabel("Feature")
+
+                    axs[1].imshow(
+                        y[np.newaxis, :], aspect=20, origin="lower", cmap="seismic"
+                    )
+                    axs[1].set_title("Target")
+                    axs[1].set_xticks([])
+                    axs[1].set_yticks([])
+                    plt.savefig(
+                        f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-label_{task}_{group}{raw_or_embedded}.png",
+                        dpi=150,
+                        bbox_inches="tight",
+                        pad_inches=0.2,
+                        format="png",
+                    )
+                    plt.close()
diff --git a/task_dFC/multi_dataset_analysis/task_presence_binarization.py b/task_dFC/multi_dataset_analysis/task_presence_binarization.py
new file mode 100644
index 0000000..a736a74
--- /dev/null
+++ b/task_dFC/multi_dataset_analysis/task_presence_binarization.py
@@ -0,0 +1,219 @@
+import argparse
+import json
+import os
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from pydfc.ml_utils import find_available_subjects, load_task_data
+from pydfc.task_utils import extract_task_presence
+
+#######################################################################################
+
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to visualize task timing and binarization results for multiple datasets.
+    """
+
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    parser.add_argument(
+        "--multi_dataset_info", type=str, help="path to multi-dataset info file"
+    )
+    parser.add_argument(
+        "--simul_or_real", type=str, help="Specify 'simulated' or 'real' data"
+    )
+
+    args = parser.parse_args()
+
+    multi_dataset_info = args.multi_dataset_info
+    simul_or_real = args.simul_or_real
+
+    # Read dataset info
+    with open(multi_dataset_info, "r") as f:
+        multi_dataset_info = json.load(f)
+
+    print("Multi-Dataset Analysis started ...")
+
+    if simul_or_real == "real":
+        main_root = multi_dataset_info["real_data"]["main_root"]
+        DATASETS = multi_dataset_info["real_data"]["DATASETS"]
+        TASKS_to_include = multi_dataset_info["real_data"]["TASKS_to_include"]
+    elif simul_or_real == "simulated":
+        main_root = multi_dataset_info["simulated_data"]["main_root"]
+        DATASETS = multi_dataset_info["simulated_data"]["DATASETS"]
+        TASKS_to_include = multi_dataset_info["simulated_data"]["TASKS_to_include"]
+    output_root = f"{multi_dataset_info['output_root']}/task_timing/{simul_or_real}"
+
+    if not os.path.exists(output_root):
+        os.makedirs(output_root)
+
+    for dataset in DATASETS:
+        print(f"Processing dataset: {dataset}")
+        dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
+        roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
+        dFC_root = f"{main_root}/{dataset}/derivatives/dFC_assessed"
+
+        # Read dataset info
+        with open(dataset_info_file, "r") as f:
+            dataset_info = json.load(f)
+
+        if "SESSIONS" in dataset_info:
+            SESSIONS = dataset_info["SESSIONS"]
+        else:
+            SESSIONS = None
+        if SESSIONS is None:
+            SESSIONS = [None]
+
+        TASKS = dataset_info["TASKS"]
+
+        if "RUNS" in dataset_info:
+            RUNS = dataset_info["RUNS"]
+        else:
+            RUNS = None
+        if RUNS is None:
+            RUNS = {task: [None] for task in TASKS}
+
+        for task in TASKS:
+            for session in SESSIONS:
+                print(f"Processing task: {task}")
+                SUBJECTS = find_available_subjects(
+                    dFC_root=dFC_root,
+                    task=task,
+                    dFC_id=None,
+                    session=session,
+                )
+                if len(SUBJECTS) == 0:  # nothing available for this task/session
+                    continue
+                subj = SUBJECTS[0]  # only the first available subject is plotted
+                run = RUNS[task][0]  # only the first run of the task is plotted
+                try:
+                    task_data = load_task_data(
+                        roi_root=roi_root, subj=subj, task=task, run=run, session=session
+                    )
+                except FileNotFoundError:  # missing task data: skip this session
+                    continue
+
+                stimulus_timing = np.multiply(task_data["event_labels"] != 0, 1)
+
+                event_labels_all_task_hrf, _ = extract_task_presence(
+                    event_labels=task_data["event_labels"],
+                    TR_task=1 / task_data["Fs_task"],
+                    TR_mri=1 / task_data["Fs_task"],
+                    TR_array=None,
+                    binary=False,
+                    binarizing_method="GMM",
+                    no_hrf=False,
+                )
+
+                task_presence, indices = extract_task_presence(
+                    event_labels=task_data["event_labels"],
+                    TR_task=1 / task_data["Fs_task"],
+                    TR_mri=1 / task_data["Fs_task"],
+                    TR_array=None,
+                    binary=True,
+                    binarizing_method="GMM",
+                    no_hrf=False,
+                )
+                # plot event_labels_all_task_hrf
+                plt.figure(figsize=(250, 10))
+                print(
+                    f"Fs_task: {task_data['Fs_task']}, TR_mri: {task_data['TR_mri']}, length of event_labels_all_task_hrf: {len(event_labels_all_task_hrf)}"
+                )
+                plt.plot(
+                    stimulus_timing,
+                    label="Stimulus Timing",
+                    color="#B8AD6F",
+                    linewidth=15,
+                )
+                # plt.plot(task_presence, label="Task Presence", color="blue", linewidth=3)
+                plt.plot(
+                    event_labels_all_task_hrf,
+                    label="HRF Convolved",
+                    color="#010101",
+                    linewidth=8,
+                )
+                # plot a vertical dashed line at every TR_mri; samples-per-TR is
+                # rounded (not truncated) and kept >= 1 so range() never gets step 0
+                tr_step = max(
+                    int(round(task_data["TR_mri"] * task_data["Fs_task"])), 1
+                )
+                for i in range(0, len(event_labels_all_task_hrf), tr_step):
+                    plt.axvline(x=i, color="#c20707", linestyle="--", linewidth=5.0)
+
+                # on_indices are index in indices where task_presence=1
+                on_indices = indices[task_presence[indices] == 1]
+                # off_indices are index in indices where task_presence=0
+                off_indices = indices[task_presence[indices] == 0]
+                plt.scatter(
+                    on_indices,
+                    event_labels_all_task_hrf[on_indices],
+                    color="#7ab3dc",
+                    label="on_indices",
+                    s=300,
+                    zorder=10,
+                )
+                plt.scatter(
+                    off_indices,
+                    event_labels_all_task_hrf[off_indices],
+                    color="#A8ACAD",
+                    label="off_indices",
+                    s=300,
+                    zorder=10,
+                )
+
+                # remove all axis and spines, show only x axis
+                plt.gca().spines["top"].set_visible(False)
+                plt.gca().spines["right"].set_visible(False)
+                plt.gca().spines["left"].set_visible(False)
+                plt.gca().spines["bottom"].set_visible(True)
+                # increase bottom spine width
+                plt.gca().spines["bottom"].set_linewidth(5)
+                plt.gca().yaxis.set_visible(False)
+                plt.gca().xaxis.set_visible(True)
+
+                # # set background color to light pink
+                # plt.gca().set_facecolor("#F7EFEF")
+
+                # set x ticks to be every TR_mri
+                step_factor = 1
+                # if event_labels_all_task_hrf has more than 6500 samples, enlarge
+                # step_factor to ceil(length / 6500) + 1 to make the plot less crowded
+                if len(event_labels_all_task_hrf) > 6500:
+                    step_factor = (
+                        np.ceil(len(event_labels_all_task_hrf) / 6500).astype(int) + 1
+                    )
+                step = int(
+                    round(task_data["TR_mri"] * task_data["Fs_task"] * step_factor)
+                )
+                step = max(step, 1)  # avoid step=0
+
+                ticks = np.arange(0, len(event_labels_all_task_hrf), step)
+                plt.gca().set_xticks(ticks)
+
+                TR_labels = np.arange(
+                    len(ticks) * step_factor, step=step_factor
+                )  # same length as ticks
+                # label each tick as time in seconds, TR_labels*TR_mri
+                time_labels = np.round(TR_labels * task_data["TR_mri"]).astype(int)
+                plt.gca().set_xticklabels(time_labels, fontsize=50)
+                plt.xlabel("Time (sec)", fontsize=60)
+
+                plt.savefig(
+                    f"{output_root}/task_timing_{task}.png",
+                    dpi=120,
+                    bbox_inches="tight",
+                    pad_inches=0.1,
+                    format="png",
+                )
+                if task == "task-Localizer":
+                    plt.savefig(
+                        f"{output_root}/task_timing_{task}.svg",
+                        dpi=120,
+                        bbox_inches="tight",
+                        pad_inches=0.1,
+                        format="svg",
+                    )
+
+                plt.close()
diff --git a/task_dFC/multi_dataset_analysis/task_timing_stats.py b/task_dFC/multi_dataset_analysis/task_timing_stats.py
new file mode 100644
index 0000000..fbc9d84
--- /dev/null
+++ b/task_dFC/multi_dataset_analysis/task_timing_stats.py
@@ -0,0 +1,388 @@
+import argparse
+import json
+import os
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+
+from pydfc.data_loader import find_subj_list
+from pydfc.ml_utils import load_task_data
+from pydfc.task_utils import (
+    calc_relative_task_on,
+    calc_rest_duration,
+    calc_task_duration,
+    calc_transition_freq,
+    extract_task_presence,
+)
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from helper_functions import (  # pyright: ignore[reportMissingImports]
+    annotate_medians_by_geometry,
+    annotate_medians_single_boxplot,
+    as_long_df,
+    order_by_median_dict,
+    setup_pub_style,
+)
+
+fig_bbox_inches = "tight"
+fig_pad = 0.1
+show_title = False
+save_fig_format = "png"  # pdf, png,
+
+#######################################################################################
+
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to analyze and visualize task timing statistics across multiple datasets.
+    """
+
+    setup_pub_style()
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    parser.add_argument(
+        "--multi_dataset_info", type=str, help="path to multi-dataset info file"
+    )
+    parser.add_argument(
+        "--simul_or_real", type=str, help="Specify 'simulated' or 'real' data"
+    )
+
+    args = parser.parse_args()
+
+    multi_dataset_info = args.multi_dataset_info
+    simul_or_real = args.simul_or_real
+
+    # Read dataset info
+    with open(multi_dataset_info, "r") as f:
+        multi_dataset_info = json.load(f)
+
+    if simul_or_real == "real":
+        main_root = multi_dataset_info["real_data"]["main_root"]
+        DATASETS = multi_dataset_info["real_data"]["DATASETS"]
+        TASKS_to_include = multi_dataset_info["real_data"]["TASKS_to_include"]
+    elif simul_or_real == "simulated":
+        main_root = multi_dataset_info["simulated_data"]["main_root"]
+        DATASETS = multi_dataset_info["simulated_data"]["DATASETS"]
+        TASKS_to_include = multi_dataset_info["simulated_data"]["TASKS_to_include"]
+    output_root = f"{multi_dataset_info['output_root']}/task_data_stats/{simul_or_real}"
+
+    if not os.path.exists(output_root):
+        os.makedirs(output_root)
+
+    task_ratio_all = {}
+    transition_freq_all = {}
+    rest_durations_all = {}
+    task_durations_all = {}
+    for dataset in DATASETS:
+
+        print(f"Processing dataset: {dataset}")
+        dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
+        roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
+        dFC_root = f"{main_root}/{dataset}/derivatives/dFC_assessed"
+
+        # Read dataset info
+        with open(dataset_info_file, "r") as f:
+            dataset_info = json.load(f)
+
+        if "SESSIONS" in dataset_info:
+            SESSIONS = dataset_info["SESSIONS"]
+        else:
+            SESSIONS = None
+        if SESSIONS is None:
+            SESSIONS = [None]
+
+        TASKS = dataset_info["TASKS"]
+
+        if "RUNS" in dataset_info:
+            RUNS = dataset_info["RUNS"]
+        else:
+            RUNS = None
+        if RUNS is None:
+            RUNS = {task: [None] for task in TASKS}
+
+        for session in SESSIONS:
+            for task_id, task in enumerate(TASKS):
+                if not task in TASKS_to_include:
+                    continue
+                for run in RUNS[task]:
+                    SUBJECTS = find_subj_list(roi_root)
+                    # print(f"Number of subjects: {len(SUBJECTS)}")
+
+                    for subj in SUBJECTS:
+
+                        try:
+                            task_data = load_task_data(
+                                roi_root=roi_root,
+                                subj=subj,
+                                task=task,
+                                run=run,
+                                session=session,
+                            )
+                        except FileNotFoundError:
+                            continue
+
+                        task_presence, indices = extract_task_presence(
+                            event_labels=task_data["event_labels"],
+                            TR_task=1 / task_data["Fs_task"],
+                            TR_mri=task_data["TR_mri"],
+                            binary=True,
+                            binarizing_method="GMM",
+                            no_hrf=False,
+                        )
+
+                        relative_task_on = calc_relative_task_on(task_presence[indices])
+                        num_of_transitions, relative_transition_freq = (
+                            calc_transition_freq(task_presence[indices])
+                        )
+                        # calculate rest and task durations based on the original event labels
+                        event_labels = np.multiply(task_data["event_labels"] != 0, 1)
+                        rest_durations = calc_rest_duration(
+                            event_labels, TR_mri=1 / task_data["Fs_task"]
+                        )
+                        task_durations = calc_task_duration(
+                            event_labels, TR_mri=1 / task_data["Fs_task"]
+                        )
+
+                        if not task in task_ratio_all:
+                            task_ratio_all[task] = []
+                        if not task in transition_freq_all:
+                            transition_freq_all[task] = []
+                        if not task in rest_durations_all:
+                            rest_durations_all[task] = []
+                        if not task in task_durations_all:
+                            task_durations_all[task] = []
+                        task_ratio_all[task].append(relative_task_on)
+                        transition_freq_all[task].append(relative_transition_freq)
+                        # rest_durations and task_durations are lists
+                        rest_durations_all[task].extend(rest_durations)
+                        task_durations_all[task].extend(task_durations)
+
+    DATA = {
+        "task_ratio_all": task_ratio_all,
+        "transition_freq_all": transition_freq_all,
+        "rest_durations_all": rest_durations_all,
+        "task_durations_all": task_durations_all,
+    }
+    # np.save(f"task_data_stats_{simul_or_real}.npy", DATA)
+
+    # =========================
+    # Paper-quality seaborn plots (patched)
+    # =========================
+
+    sns.set_theme(context="paper", style="darkgrid")
+    plt.rcParams.update(
+        {
+            "figure.dpi": 300,
+            "savefig.dpi": 500,
+            "pdf.fonttype": 42,
+            "ps.fonttype": 42,
+            "axes.labelweight": "bold",
+            "axes.titleweight": "bold",
+            "axes.labelsize": 14,
+            "axes.titlesize": 16,
+            "xtick.labelsize": 11,
+            "ytick.labelsize": 11,
+            "legend.fontsize": 12,
+        }
+    )
+
+    # ==============================
+    # 1) Task ratio (sorted by median) — BOX PLOT + median labels
+    # ==============================
+    order_ratio, stats_ratio = order_by_median_dict(task_ratio_all, reverse=True)
+    df_ratio = as_long_df(task_ratio_all, "task_ratio")
+    df_ratio = df_ratio[df_ratio["task"].isin(order_ratio)]
+    df_ratio["task"] = pd.Categorical(
+        df_ratio["task"], categories=order_ratio, ordered=True
+    )
+
+    fig_w = max(15, 15 / 30 * len(order_ratio))
+    plt.figure(figsize=(fig_w, 6))
+
+    ax = sns.boxplot(
+        data=df_ratio,
+        x="task",
+        y="task_ratio",
+        order=order_ratio,
+        width=0.6,
+        linewidth=1,
+        showfliers=False,
+    )
+
+    ax.set_xlabel("Task paradigm")
+    ax.set_ylabel("Task ratio")
+    ax.set_ylim(0, 1)  # keep ratios bounded
+
+    # annotate medians (use integers if you prefer: fmt="{:.0f}")
+    annotate_medians_single_boxplot(
+        ax,
+        df_ratio,
+        x_col="task",
+        y_col="task_ratio",
+        order=order_ratio,
+        fmt="{:.2f}",
+        box_alpha=0.6,
+    )
+
+    for label in ax.get_xticklabels():
+        label.set_rotation(65)
+        label.set_horizontalalignment("right")
+        label.set_fontweight("bold")
+    if show_title:
+        ax.set_title("Task ratio per task (box + samples, ordered by median)", pad=12)
+
+    plt.tight_layout()
+    plt.savefig(
+        f"{output_root}/task_ratio_{simul_or_real}.{save_fig_format}",
+        bbox_inches=fig_bbox_inches,
+        pad_inches=fig_pad,
+    )
+    plt.close()
+
+    # ======================================
+    # 2) Transition frequency (sorted by median) — BOX PLOT + median labels
+    # ======================================
+    order_tf, stats_tf = order_by_median_dict(transition_freq_all, reverse=True)
+    df_tf = as_long_df(transition_freq_all, "transition_freq")
+    df_tf = df_tf[df_tf["task"].isin(order_tf)]
+    df_tf["task"] = pd.Categorical(df_tf["task"], categories=order_tf, ordered=True)
+
+    fig_w = max(15, 15 / 30 * len(order_tf))
+    plt.figure(figsize=(fig_w, 6))
+
+    ax = sns.boxplot(
+        data=df_tf,
+        x="task",
+        y="transition_freq",
+        order=order_tf,
+        width=0.6,
+        linewidth=1,
+        showfliers=False,
+    )
+
+    ax.set_xlabel("Task paradigm")
+    ax.set_ylabel("Relative transition frequency")
+
+    # annotate medians
+    annotate_medians_single_boxplot(
+        ax,
+        df_tf,
+        x_col="task",
+        y_col="transition_freq",
+        order=order_tf,
+        fmt="{:.2f}",
+        box_alpha=0.6,
+    )
+
+    for label in ax.get_xticklabels():
+        label.set_rotation(65)
+        label.set_horizontalalignment("right")
+        label.set_fontweight("bold")
+    if show_title:
+        ax.set_title(
+            "Transition frequency per task (box + samples, ordered by median)", pad=12
+        )
+
+    plt.tight_layout()
+    plt.savefig(
+        f"{output_root}/transition_freq_{simul_or_real}.{save_fig_format}",
+        bbox_inches=fig_bbox_inches,
+        pad_inches=fig_pad,
+    )
+    plt.close()
+
+    # =========================================================
+    # 3) Rest vs Task durations: side-by-side per task paradigm (LOG SCALE)
+    # =========================================================
+    df_rest = as_long_df(rest_durations_all, "duration")
+    df_rest["state"] = "Rest"
+    df_task = as_long_df(task_durations_all, "duration")
+    df_task["state"] = "Task"
+    df_dur = pd.concat([df_rest, df_task], ignore_index=True)
+
+    # Order tasks by median Task duration (change to Rest if you prefer)
+    order_dur, _ = order_by_median_dict(task_durations_all, reverse=True)
+    df_dur = df_dur[df_dur["task"].isin(order_dur)]
+    df_dur["task"] = pd.Categorical(df_dur["task"], categories=order_dur, ordered=True)
+
+    # ---- LOG display handling (avoid -inf for zeros) ----
+    # pick an adaptive epsilon based on the smallest positive value
+    pos = df_dur.loc[df_dur["duration"] > 0, "duration"]
+    if len(pos) == 0:
+        EPS = 1e-3
+    else:
+        EPS = max(min(pos) / 10.0, 1e-3)  # small but data-driven
+    df_dur["duration_plot"] = df_dur["duration"].clip(lower=EPS)
+
+    fig_w = max(17, 17 / 30 * len(order_dur))
+    plt.figure(figsize=(fig_w, 7))
+
+    # Boxplot on log scale (outlier fliers are hidden via showfliers=False)
+    ax = sns.boxplot(
+        data=df_dur,
+        x="task",
+        y="duration_plot",
+        hue="state",
+        order=order_dur,
+        hue_order=["Rest", "Task"],
+        linewidth=1,
+        dodge=True,
+        showfliers=False,
+        width=0.6,
+    )
+
+    # Put y-axis on log scale (preserves wide dynamic range)
+    ax.set_yscale("log")
+
+    # annotate medians on the median line (log-scale safe)
+    annotate_medians_by_geometry(
+        ax=ax,
+        df_long=df_dur,  # the DF you plotted
+        x_col="task",
+        hue_col="state",
+        y_col="duration_plot",  # the epsilon-clipped column you used for plotting
+        x_order=order_dur,
+        hue_order=["Rest", "Task"],
+        fmt="{:.0f}",
+        y_nudge_factor=1.08,  # bump if labels sit on the line in log-space
+        bin_halfwidth=0.6,  # widen if categories are very tightly packed
+        bbox_alpha=0.6,  # make label bg more opaque for legibility
+    )
+
+    # De-duplicate legend entries by label so Rest/Task each appear only once
+    handles, labels = ax.get_legend_handles_labels()
+    # the first two unique handles correspond to Rest/Task once; keep those
+    unique = []
+    seen = set()
+    for h, l in zip(handles, labels):
+        if l not in seen:
+            unique.append((h, l))
+            seen.add(l)
+    # Keep only Rest/Task (first two)
+    handles_clean, labels_clean = (
+        zip(*unique[:2]) if len(unique) >= 2 else (handles[:2], labels[:2])
+    )
+    ax.legend(handles_clean, labels_clean, title="", frameon=True, loc="upper right")
+
+    ax.set_xlabel("Task paradigm")
+    ax.set_ylabel("Duration (sec, log scale)")
+
+    for label in ax.get_xticklabels():
+        label.set_rotation(65)
+        label.set_horizontalalignment("right")
+        label.set_fontweight("bold")
+
+    if show_title:
+        ax.set_title("Rest vs Task durations per task (log scale; box + points)", pad=12)
+
+    plt.tight_layout()
+    plt.savefig(
+        f"{output_root}/durations_rest_vs_task_{simul_or_real}.{save_fig_format}",
+        bbox_inches=fig_bbox_inches,
+        pad_inches=fig_pad,
+    )
+    plt.close()
+    # =========================================================
diff --git a/task_dFC/run_scripts_slurm/multi_dataset_info.json b/task_dFC/run_scripts_slurm/multi_dataset_info.json
index 93a5a69..2bc1f21 100644
--- a/task_dFC/run_scripts_slurm/multi_dataset_info.json
+++ b/task_dFC/run_scripts_slurm/multi_dataset_info.json
@@ -1,7 +1,34 @@
 {
-	"main_root" : "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro",
-	"DATASETS" : [
-		"ds001734", "ds002843", "ds003465", "ds004044", "ds004359", "ds004746",
-		"ds002647", "ds002994", "ds003612", "ds004302", "ds004556"
-	]
+	"output_root": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/multi_dataset_analysis/results",
+	"real_data": {
+		"main_root": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro",
+		"DATASETS": [
+			"ds001734", "ds002843", "ds003465", "ds004044", "ds004359", "ds004746",
+			"ds002647", "ds002994", "ds003612", "ds004302", "ds004556"
+		],
+		"TASKS_to_include": [
+			"task-arithmetic", "task-AudSem", "task-Axcpt", "task-CIC",
+			"task-Cuedts", "task-emotionRegulation", "task-execution", "task-expo",
+			"task-fearlearning", "task-feedback", "task-fribBids", "task-IHG",
+			"task-imagery", "task-itc", "task-localiser", "task-Localizer",
+			"task-matching", "task-motor", "task-paingen", "task-ppalocalizer",
+			"task-recall", "task-risk", "task-ST", "task-Stern",
+			"task-Stroop", "task-VisRhyme", "task-VisSem", "task-VisSpell",
+			"task-vswm"
+    	]
+	},
+	"simulated_data": {
+		"main_root": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/simulated",
+		"DATASETS": [
+			"ds000001", "ds000002", "ds000003", "ds000004", "ds000005", "ds000006"
+		],
+		"TASKS_to_include": [
+			"task-Axcpt", "task-Cuedts", "task-Stern", "task-Stroop",
+			"task-lowFreqLongRest", "task-lowFreqShortRest", "task-lowFreqShortTask",
+			"task-imagery", "task-execution",
+			"task-itc", "task-risk",
+			"task-Localizer",
+			"task-ppalocalizer"
+		]
+	}
 }
diff --git a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
new file mode 100644
index 0000000..42ff2f3
--- /dev/null
+++ b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Usage: sbatch run_across_dataset_analysis.sh <script_name> [real|simulated]
+#SBATCH --job-name=across_dataset_analysis
+#SBATCH --output=logs/%x_out.txt
+#SBATCH --error=logs/%x_err.txt
+#SBATCH --time=05:00:00
+#SBATCH --mem=32G
+
+# === Activate virtual environment (`source` is a bash builtin, hence bash) ===
+source "/home/mt00/venvs/pydfc/bin/activate"
+
+# === Global variable ===
+MULTI_DATASET_INFO="/home/mt00/pydfc/dFC/task_dFC/run_scripts_slurm/multi_dataset_info.json"
+
+# === Arguments ===
+SCRIPT_NAME=$1        # e.g., ml_results.py
+SIMUL_OR_REAL=$2      # e.g., real or simulated
+SCRIPT_DIR="/home/mt00/pydfc/dFC/task_dFC/multi_dataset_analysis"
+
+# === Safety checks (diagnostics go to stderr, i.e. the --error log) ===
+if [ -z "$SCRIPT_NAME" ]; then
+    echo "Usage: sbatch run_across_dataset_analysis.sh <script_name> [real|simulated]" >&2
+    exit 1
+fi
+
+if [ -z "$SIMUL_OR_REAL" ]; then
+    SIMUL_OR_REAL="real"  # default
+fi
+
+SCRIPT_PATH="$SCRIPT_DIR/$SCRIPT_NAME"
+
+if [ ! -f "$SCRIPT_PATH" ]; then
+    echo "Error: Script '$SCRIPT_PATH' not found." >&2
+    exit 1
+fi
+
+# === Run based on script name (cohensd.py is called without --simul_or_real) ===
+case "$SCRIPT_NAME" in
+    ml_results.py | dfc_visualization.py | LE_embedding_visualization.py | sample_matrix_visualization.py | task_presence_binarization.py | task_timing_stats.py)
+        python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO" --simul_or_real "$SIMUL_OR_REAL"
+        ;;
+    cohensd.py)
+        python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO"
+        ;;
+    *)
+        echo "Unknown script: $SCRIPT_NAME" >&2
+        exit 1
+        ;;
+esac
+
+# === Deactivate virtual environment ===
+deactivate
diff --git a/task_dFC/validation.py b/task_dFC/validation.py
deleted file mode 100644
index 05fcb24..0000000
--- a/task_dFC/validation.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import os
-import time
-import warnings
-
-import numpy as np
-
-from pydfc import MultiAnalysis, data_loader
-
-warnings.simplefilter("ignore")
-
-os.environ["MKL_NUM_THREADS"] = "16"
-os.environ["NUMEXPR_NUM_THREADS"] = "16"
-os.environ["OMP_NUM_THREADS"] = "16"
-
-################################# Parameters #################################
-
-# Data parameters
-# main_root = '../../DATA/ds002785/' # for local
-main_root = "../../../DATA/task-based/openneuro/ds002785/"  # for server
-dFC_assessed_root = main_root + "dFC_assessed/"
-output_root = main_root + "validation_results/"
-
-################################# LOAD FIT MEASURES #################################
-
-SUBJECTS = data_loader.find_subj_list(
-    data_root=roi_root, sessions=params_data_load["SESSIONs"]
-)
-
-ALL_RECORDS = os.listdir(dFC_assessed_root)
-ALL_RECORDS = [i for i in ALL_RECORDS if "dFC" in i]
-ALL_RECORDS.sort()
-dFC_lst = list()
-for s in ALL_RECORDS:
-    dFC = np.load(dFC_assessed_root + s, allow_pickle="TRUE").item()
-    dFC_lst.append(dFC)
-print("dFCs loaded ...")
-
-################################# SIMILARITY MEASUREMENT #################################
-
-# similarity_assessment = SIMILARITY_ASSESSMENT(dFCM_lst=dFCM_dict['dFCM_lst'])
-
-# tic = time.time()
-# print('Measurement Started ...')
-
-# print("Similarity measurement started...")
-# SUBJ_output = similarity_assessment.run(FILTERS=dFC_analyzer.hyper_param_info, downsampling_method='default')
-# print("Similarity measurement done.")
-
-# print('Measurement required %0.3f seconds.' % (time.time() - tic, ))
-
-# # Save
-# folder = output_root+'similarity_measured'
-# if not os.path.exists(folder):
-#     os.makedirs(folder)
-
-# np.save(folder+'/SUBJ_'+str(subj_id)+'_output.npy', SUBJ_output)
-
-#######################################################################################

From 7aa52f5345f3244dfa54e639604ba7d9f8eb6bbc Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 15 Oct 2025 22:50:49 -0400
Subject: [PATCH 261/401] minor fix

---
 task_dFC/run_scripts_slurm/multi_dataset_info.json | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/multi_dataset_info.json b/task_dFC/run_scripts_slurm/multi_dataset_info.json
index 2bc1f21..cb32fd2 100644
--- a/task_dFC/run_scripts_slurm/multi_dataset_info.json
+++ b/task_dFC/run_scripts_slurm/multi_dataset_info.json
@@ -3,12 +3,16 @@
 	"real_data": {
 		"main_root": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro",
 		"DATASETS": [
-			"ds001734", "ds002843", "ds003465", "ds004044", "ds004359", "ds004746",
-			"ds002647", "ds002994", "ds003612", "ds004302", "ds004556"
+			"ds001242", "ds001734", "ds002236", "ds002647",
+			"ds002785", "ds002843", "ds002994", "ds003242",
+			"ds003465", "ds003612", "ds003717", "ds003823",
+			"ds004044", "ds004302", "ds004349", "ds004359",
+			"ds004556", "ds004711", "ds004746", "ds004791",
+			"ds004848", "ds005038"
 		],
 		"TASKS_to_include": [
 			"task-arithmetic", "task-AudSem", "task-Axcpt", "task-CIC",
-			"task-Cuedts", "task-emotionRegulation", "task-execution", "task-expo",
+			"task-Cuedts", "task-emotionRegulation", "task-execution","task-expo",
 			"task-fearlearning", "task-feedback", "task-fribBids", "task-IHG",
 			"task-imagery", "task-itc", "task-localiser", "task-Localizer",
 			"task-matching", "task-motor", "task-paingen", "task-ppalocalizer",

From bd4766f407a3adfbb6913739237db6b32b2c0008 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 16 Oct 2025 15:32:40 -0400
Subject: [PATCH 262/401] fix minor

---
 .../multi_dataset_analysis/LE_embedding_visualization.py  | 6 ++----
 task_dFC/multi_dataset_analysis/ml_results.py             | 8 ++++----
 task_dFC/multi_dataset_analysis/task_timing_stats.py      | 4 ++--
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py b/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py
index 4f278b2..6a2298a 100644
--- a/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py
+++ b/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py
@@ -56,9 +56,7 @@
         DATASETS = multi_dataset_info["simulated_data"]["DATASETS"]
         TASKS_to_include = multi_dataset_info["simulated_data"]["TASKS_to_include"]
 
-    output_root = (
-        f"{multi_dataset_info['output_root']}/task_presence_embed/{simul_or_real}"
-    )
+    output_root = f"{multi_dataset_info['output_root']}/LE_embed/{simul_or_real}"
 
     if not os.path.exists(output_root):
         os.makedirs(output_root)
@@ -180,7 +178,7 @@
                             plt.legend()
 
                             plt.savefig(
-                                f"{output_root}/task_presence_embed_{task}_{measure_name}.png",
+                                f"{output_root}/LE_embed_{task}_{measure_name}.png",
                                 dpi=fig_dpi,
                                 bbox_inches=fig_bbox_inches,
                                 pad_inches=fig_pad,
diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index e849704..9c7ee9b 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -192,7 +192,7 @@
         # --- domain tagging & task ordering/coloring (only for real data) ---
         if simul_or_real == "real":
             df["domain"] = df["task"].map(task_domain_real)
-        elif simul_or_real == "simul":
+        elif simul_or_real == "simulated":
             df["domain"] = df["task"].map(task_domain_simul)
         # Use tasks present in THIS slice
         task_order, task_palette = build_task_order_and_palette(
@@ -335,7 +335,7 @@
                 markersize=5,
             )
             ax_leg.set_title("Task Paradigm", fontsize=9, pad=4, fontweight="bold")
-        elif simul_or_real == "simul":
+        elif simul_or_real == "simulated":
             domain_of = {t: task_domain_simul(t) for t in task_order}
             draw_grouped_legend_panel(
                 ax_leg,
@@ -408,7 +408,7 @@
             }  # your task_domain helper
             domain_x_frac = -0.8
             ylabel_pad_pts = 130
-        elif simul_or_real == "simul":
+        elif simul_or_real == "simulated":
             task_to_domain = {
                 t: task_domain_simul(t) for t in row_order
             }  # your task_domain helper
@@ -507,7 +507,7 @@
                 task_to_domain = {t: task_domain_real(t) for t in row_order}
                 domain_x_frac = -0.8
                 ylabel_pad_pts = 130
-            elif simul_or_real == "simul":
+            elif simul_or_real == "simulated":
                 task_to_domain = {t: task_domain_simul(t) for t in row_order}
                 domain_x_frac = -1.0
                 ylabel_pad_pts = 110
diff --git a/task_dFC/multi_dataset_analysis/task_timing_stats.py b/task_dFC/multi_dataset_analysis/task_timing_stats.py
index fbc9d84..9158546 100644
--- a/task_dFC/multi_dataset_analysis/task_timing_stats.py
+++ b/task_dFC/multi_dataset_analysis/task_timing_stats.py
@@ -67,7 +67,7 @@
         main_root = multi_dataset_info["simulated_data"]["main_root"]
         DATASETS = multi_dataset_info["simulated_data"]["DATASETS"]
         TASKS_to_include = multi_dataset_info["simulated_data"]["TASKS_to_include"]
-    output_root = f"{multi_dataset_info['output_root']}/task_data_stats/{simul_or_real}"
+    output_root = f"{multi_dataset_info['output_root']}/task_timing_stats/{simul_or_real}"
 
     if not os.path.exists(output_root):
         os.makedirs(output_root)
@@ -166,7 +166,7 @@
         "rest_durations_all": rest_durations_all,
         "task_durations_all": task_durations_all,
     }
-    # np.save(f"task_data_stats_{simul_or_real}.npy", DATA)
+    # np.save(f"task_timing_stats_{simul_or_real}.npy", DATA)
 
     # =========================
     # Paper-quality seaborn plots (patched)

From 342e3e36239a92b878b30ebf266f20163b3fef4c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 16 Oct 2025 15:47:50 -0400
Subject: [PATCH 263/401] helper_functions: compare simul_or_real against "simulated", not "simul"

---
 task_dFC/multi_dataset_analysis/helper_functions.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index bb3a768..d69ce37 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -160,7 +160,7 @@ def get_cog_domain_info(simul_or_real: str):
             "Functional localizers": "#35cf33",
             "Other": "#646464",
         }
-    elif simul_or_real == "simul":
+    elif simul_or_real == "simulated":
         # --- Categories of simulated task paradigms ---
         DOMAIN_ORDER = [
             "Simulated Periodic",
@@ -213,7 +213,7 @@ def task_domain_real(task: str) -> str:
 
 
 def task_domain_simul(task: str) -> str:
-    _, TASK2DOMAIN, _ = get_cog_domain_info("simul")
+    _, TASK2DOMAIN, _ = get_cog_domain_info("simulated")
     return TASK2DOMAIN.get(canon_task(task), "Other")
 
 
@@ -249,7 +249,7 @@ def build_task_order_and_palette(
     tasks = list(tasks_iterable)
     if simul_or_real == "real":
         dom_of = {t: task_domain_real(t) for t in tasks}
-    elif simul_or_real == "simul":
+    elif simul_or_real == "simulated":
         dom_of = {t: task_domain_simul(t) for t in tasks}
 
     DOMAIN_ORDER, _, DOMAIN_BASE = get_cog_domain_info(simul_or_real)
@@ -281,7 +281,7 @@ def domain_sorted_rows(index_tasks, TASKS_to_include, simul_or_real):
     # domain-first, then alphabetical
     if simul_or_real == "real":
         dom_of = {t: task_domain_real(t) for t in present}
-    elif simul_or_real == "simul":
+    elif simul_or_real == "simulated":
         dom_of = {t: task_domain_simul(t) for t in present}
     DOMAIN_ORDER, _, _ = get_cog_domain_info(simul_or_real)
     ordered = []

From 70b665b7122f2c7a4fd96584d13dc5cf818180d7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 16 Oct 2025 19:25:20 -0400
Subject: [PATCH 264/401] cohensd: create output_root directory if missing

---
 task_dFC/multi_dataset_analysis/cohensd.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/task_dFC/multi_dataset_analysis/cohensd.py b/task_dFC/multi_dataset_analysis/cohensd.py
index 28f6d49..b9791a7 100644
--- a/task_dFC/multi_dataset_analysis/cohensd.py
+++ b/task_dFC/multi_dataset_analysis/cohensd.py
@@ -1,5 +1,6 @@
 import argparse
 import json
+import os
 
 import matplotlib.pyplot as plt
 import nibabel as nib
@@ -39,6 +40,9 @@
     TASKS_to_include = multi_dataset_info["real_data"]["TASKS_to_include"]
     output_root = f"{multi_dataset_info['output_root']}/CohensD"
 
+    if not os.path.exists(output_root):
+        os.makedirs(output_root)
+
     CohensD_across_task = {
         "task": [],
         "d_values": [],

From 297a93b554c3c3e892c9d078c90ac57c7ed4ae80 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 17 Oct 2025 11:27:45 -0400
Subject: [PATCH 265/401] sample_matrix_visualization: create measure folder under output_root

---
 .../multi_dataset_analysis/sample_matrix_visualization.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 04f6459..3c855b7 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -383,8 +383,8 @@ def order_rows(A):
                     cb.ax.tick_params(labelsize=8)
 
                     # if the folder does not exist, create it
-                    if not os.path.exists(f"feature-sample/{measure_name}"):
-                        os.makedirs(f"feature-sample/{measure_name}")
+                    if not os.path.exists(f"{output_root}/{measure_name}"):
+                        os.makedirs(f"{output_root}/{measure_name}")
 
                     plt.savefig(
                         f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",

From 791b904cc26bbe55f8065ad78f15d6efcec55f24 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 21 Oct 2025 23:03:23 -0400
Subject: [PATCH 266/401] improve sample matrix visualization

---
 .../helper_functions.py                       | 218 ++++++++++++++
 .../sample_matrix_visualization.py            | 272 +++---------------
 .../run_across_dataset_analysis.sh            |  14 +-
 3 files changed, 272 insertions(+), 232 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index d69ce37..5007d11 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -11,6 +11,9 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
+from matplotlib.colors import ListedColormap
+from scipy.cluster.hierarchy import leaves_list, linkage
+from scipy.stats import ttest_ind
 
 ###################### Publication style ######################
 
@@ -914,3 +917,218 @@ def figure_dfc_matrices_window_png(
     print(
         f"Saved {outfile}  |  TR columns: {len(idxs)}  |  vmin={vmin:.3f}, vmax={vmax:.3f}  |  dpi={dpi}"
     )
+
+
+###################### sample_matrix plots ######################
+
+
+def plot_samples_features(
+    X,
+    y,
+    *,
+    sample_order="original",  # "original" | "label" | "label+cluster"
+    feature_order="original",  # "original" | "tstat"
+    col_order_from_train=None,  # optional np.ndarray (feature indices) to reuse on test
+    ZSCORE=True,
+    V_RANGE=2.0,
+    cmap="coolwarm",
+    title=None,
+    save_path=None,
+    show=True,
+):
+    """
+    X: (n_samples, n_features) matrix (features in columns)
+    y: (n_samples,) binary (0=rest, 1=task)
+
+    Samples are shown along the horizontal axis (time-like), features along the vertical axis.
+    If feature_order == "tstat", a slim vertical t-stat bar is shown on the LEFT,
+    aligned 1:1 with feature rows (no top t-bar).
+    """
+    # ---------- prep ----------
+    X = np.asarray(X, float)
+    y = np.asarray(y)
+    n_samples, n_features = X.shape
+
+    # z-score per feature
+    Xz = X.copy()
+    if ZSCORE:
+        mu = Xz.mean(axis=0, keepdims=True)
+        sd = Xz.std(axis=0, keepdims=True) + 1e-8
+        Xz = (Xz - mu) / sd
+
+    # ---------- feature order ----------
+    if feature_order == "tstat":
+        if col_order_from_train is not None:
+            col_order = np.asarray(col_order_from_train, int)
+            t, _ = ttest_ind(Xz[y == 1], Xz[y == 0], axis=0, equal_var=False)
+            t_ord = t[col_order]
+        else:
+            t, _ = ttest_ind(Xz[y == 1], Xz[y == 0], axis=0, equal_var=False)
+            col_order = np.argsort(-np.abs(t))  # strongest contrast first
+            t_ord = t[col_order]
+    else:
+        col_order = np.arange(n_features)
+        t_ord = None  # no t-stat bar
+
+    # ---------- sample order ----------
+    if sample_order == "original":
+        row_order = np.arange(n_samples)
+        split = np.sum(y == 0)
+        draw_separator = False
+    elif sample_order == "label":
+        rest_idx = np.where(y == 0)[0]
+        task_idx = np.where(y == 1)[0]
+        row_order = np.r_[rest_idx, task_idx]
+        split = len(rest_idx)
+        draw_separator = True
+    elif sample_order == "label+cluster":
+
+        def order_rows(A):
+            if len(A) <= 2:
+                return np.arange(len(A))
+            return leaves_list(linkage(A, method="average", metric="cosine"))
+
+        rest_idx = np.where(y == 0)[0]
+        task_idx = np.where(y == 1)[0]
+        rest_order = rest_idx[order_rows(Xz[rest_idx])] if len(rest_idx) else rest_idx
+        task_order = task_idx[order_rows(Xz[task_idx])] if len(task_idx) else task_idx
+        row_order = np.r_[rest_order, task_order]
+        split = len(rest_order)
+        draw_separator = True
+    else:
+        raise ValueError(
+            "sample_order must be one of {'original','label','label+cluster'}"
+        )
+
+    # ---------- figure & layout (no top t-bar) ----------
+    # W = max(10, min(24, n_samples / 30))
+    w_min = 12
+    w_max = 24
+    width_per_100 = 0.5  # additional width per 100 samples
+    W = float(np.clip(w_min + (n_samples / 100.0) * width_per_100, w_min, w_max))
+    H = max(6, min(16, n_features / 30))
+    fig = plt.figure(figsize=(W, H))
+
+    gs = fig.add_gridspec(
+        nrows=2,
+        ncols=1,
+        height_ratios=[1.0, 0.06],  # main heatmap + class strip
+        hspace=0.08,
+    )
+    ax_main = fig.add_subplot(gs[0, 0])
+    ax_lab = fig.add_subplot(gs[1, 0])
+
+    # ---------- main heatmap ----------
+    img = Xz[row_order, :][:, col_order].T  # (features, samples)
+    im = ax_main.imshow(
+        img, aspect="auto", origin="lower", cmap=cmap, vmin=-V_RANGE, vmax=V_RANGE
+    )
+    n_features = img.shape[0]
+    if n_features < 10:
+        yticks_step = 1
+    else:
+        yticks_step = 2
+    ticks = np.arange(0, n_features, yticks_step, dtype=int)
+    ax_main.set_yticks(ticks)
+    labels = [str(int(t) + 1) for t in ticks]
+    ax_main.set_yticklabels(labels)
+    ax_main.set_ylabel("feature", fontsize=12, fontweight="bold")
+    ax_main.set_xlabel("sample", fontsize=12, fontweight="bold")
+    ax_main.set_xticks([])
+    ax_main.tick_params(axis="y", labelsize=10)
+
+    if draw_separator and 0 < split < n_samples:
+        ax_main.axvline(split - 0.5, color="k", lw=1)
+
+    # ---------- bottom class strip ----------
+    y_reordered = y[row_order]
+    cmap_lbl = ListedColormap(
+        [[0.85, 0.85, 0.85], [0.25, 0.5, 0.9]]
+    )  # rest=gray, task=blue
+    ax_lab.imshow(
+        y_reordered[None, :], aspect="auto", origin="lower", cmap=cmap_lbl, vmin=0, vmax=1
+    )
+    ax_lab.set_yticks([])
+    ax_lab.set_xticks([])
+    # ax_lab.set_title("class", fontsize=11, pad=2)
+
+    # show class labels only when there is label grouping
+    if draw_separator:
+        n0 = (y_reordered == 0).sum()
+        n1 = (y_reordered == 1).sum()
+        if n0 > 0:
+            x0 = (n0 - 1) / 2.0
+            ax_lab.annotate(
+                "rest (0)",
+                xy=(x0, -0.35),
+                xycoords=("data", "axes fraction"),
+                ha="center",
+                va="top",
+                fontsize=11,
+                fontweight="bold",
+            )
+        if n1 > 0:
+            x1 = n0 + (n1 - 1) / 2.0
+            ax_lab.annotate(
+                "task (1)",
+                xy=(x1, -0.35),
+                xycoords=("data", "axes fraction"),
+                ha="center",
+                va="top",
+                fontsize=11,
+                fontweight="bold",
+            )
+
+    # --- move the class bar (ax_lab) down a bit ---
+    fig.canvas.draw()  # ensure positions are current
+    lab_box = ax_lab.get_position()  # [x0, y0, width, height] in figure coords
+    down = 0.020  # how much to move down (figure fraction)
+    new_y0 = max(0.01, lab_box.y0 - down)  # keep it inside the figure
+    ax_lab.set_position([lab_box.x0, new_y0, lab_box.width, lab_box.height])
+
+    # (re)grab the updated box for the colorbar placement that comes next
+    lab_box = ax_lab.get_position()
+
+    # ---------- LEFT vertical t-stat bar (only if feature_order=="tstat") ----------
+    if t_ord is not None:
+        fig.canvas.draw()
+        main_box = ax_main.get_position()  # figure coords
+
+        tbar_left_width = 0.010  # ~2% fig width
+        tbar_left_pad = 0.035 / W * 24  # gap from main heatmap, proportional to fig width
+
+        x0 = max(0.01, main_box.x0 - tbar_left_pad - tbar_left_width)
+        y0 = main_box.y0
+        w = tbar_left_width
+        h = main_box.height
+
+        ax_tleft = fig.add_axes([x0, y0, w, h])
+        m = np.nanmax(np.abs(t_ord)) if np.isfinite(t_ord).any() else 1.0
+        ax_tleft.imshow(
+            t_ord[:, None], origin="lower", aspect="auto", cmap=cmap, vmin=-m, vmax=m
+        )
+        ax_tleft.set_xticks([])
+        ax_tleft.set_yticks([])
+        ax_tleft.set_title("t-stat", fontsize=11, pad=2, fontweight="bold")
+
+    # ---------- colorbar (slightly lower so it doesn't overlap class labels) ----------
+    fig.canvas.draw()
+    lab_box = ax_lab.get_position()
+    cbar_h = 0.02
+    cbar_y = max(0.01, lab_box.y0 - 0.085)  # you liked 0.085
+    cax = fig.add_axes([0.12, cbar_y, 0.30, cbar_h])
+    cb = plt.colorbar(im, cax=cax, orientation="horizontal")
+    cb.set_label("z-scored feature value", fontsize=11, fontweight="bold")
+    cb.ax.tick_params(labelsize=10)
+
+    if title:
+        fig.suptitle(title, y=0.995, fontsize=12, fontweight="bold")
+
+    if save_path:
+        fig.savefig(save_path, dpi=300, bbox_inches="tight", pad_inches=0.15)
+    if show:
+        plt.show()
+    else:
+        plt.close(fig)
+
+    return dict(row_order=row_order, col_order=col_order)
diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 3c855b7..e1c9261 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -1,13 +1,9 @@
 import argparse
 import json
 import os
+import sys
 
-import matplotlib.pyplot as plt
-import matplotlib.transforms as mtransforms
 import numpy as np
-from matplotlib.colors import ListedColormap
-from scipy.cluster.hierarchy import leaves_list, linkage
-from scipy.stats import ttest_ind
 
 from pydfc.ml_utils import (
     dFC_feature_extraction,
@@ -16,6 +12,11 @@
     process_SB_features,
 )
 
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from helper_functions import (
+    plot_samples_features,  # pyright: ignore[reportMissingImports]
+)
+
 use_raw_features = False  # if True, use raw dFC features instead of embedded features
 normalize_dFC = True
 FCS_proba_for_SB = True
@@ -237,240 +238,51 @@
 
                 # np.save(f"{output_root}/processed_data/{dataset}_{task}_{measure_name}.npy", DATA[task])
 
-                SORT_FEATURES = True
-                ZSCORE = True
-                V_RANGE = 2.0  # heatmap color range after z-scoring
-
                 for group, X, y in zip(
                     ["train", "test"], [X_train, X_test], [y_train, y_test]
                 ):
-
-                    # X: (n_samples, n_features) = LE-transformed dFC features
-                    # y: (n_samples,) binary (0=rest, 1=task)
-                    # Optional: z-score features so the imshow uses comparable scales
-                    Xz = X.copy().astype(float)
-                    if ZSCORE:
-                        Xz = (Xz - Xz.mean(0)) / (Xz.std(0) + 1e-8)
-
-                    # --- supervised feature order ---
-                    t, p = ttest_ind(Xz[y == 1], Xz[y == 0], axis=0, equal_var=False)
-                    if SORT_FEATURES:
-                        if group == "train":
-                            # if test, use train's t-stat order
-                            col_order = np.argsort(
-                                -np.abs(t)
-                            )  # strongest class contrast first
-                    else:
-                        col_order = np.arange(Xz.shape[1])  # original order
-
-                    # --- row order: cluster within each class (cosine is nice for patterns) ---
-                    def order_rows(A):
-                        if len(A) <= 2:
-                            return np.arange(len(A))
-                        return leaves_list(linkage(A, method="average", metric="cosine"))
-
-                    rest_idx = np.where(y == 0)[0]
-                    task_idx = np.where(y == 1)[0]
-                    rest_order = rest_idx[order_rows(Xz[rest_idx])]
-                    task_order = task_idx[order_rows(Xz[task_idx])]
-                    row_order = np.r_[rest_order, task_order]
-                    split = len(rest_order)
-
-                    # --- plot: main heatmap + class strip + top contrast bar ---
-                    fig = plt.figure(figsize=(20, 10))
-                    gs = fig.add_gridspec(
-                        nrows=2,
-                        ncols=2,
-                        height_ratios=[0.07, 1],
-                        width_ratios=[1, 0.03],
-                        hspace=0.05,
-                        wspace=0.05,
-                    )
-                    ax_top = fig.add_subplot(gs[0, 0])
-                    ax_main = fig.add_subplot(gs[1, 0])
-                    ax_lab = fig.add_subplot(gs[1, 1])
-
-                    # top bar: signed t-stat per feature (same column order)
-                    t_ord = t[col_order]
-                    m = np.nanmax(np.abs(t_ord))
-                    im_top = ax_top.imshow(
-                        t_ord[None, :], aspect="auto", cmap="coolwarm", vmin=-m, vmax=m
-                    )
-                    ax_top.set_xticks([])
-                    ax_top.set_yticks([])
-                    ax_top.set_title(
-                        "Feature contrast (t-stat): task − rest", fontsize=10
-                    )
-
-                    # main heatmap
-                    im = ax_main.imshow(
-                        Xz[row_order][:, col_order],
-                        aspect="auto",
-                        cmap="coolwarm",
-                        vmin=-V_RANGE,
-                        vmax=V_RANGE,
-                    )
-                    ax_main.axhline(
-                        split - 0.5, color="k", lw=1
-                    )  # separator between rest/task
-                    ax_main.set_ylabel("samples")
-                    ax_main.set_xticks([])
-
-                    # class strip (right)
-                    cmap_lbl = ListedColormap(
-                        [[0.85, 0.85, 0.85], [0.25, 0.5, 0.9]]
-                    )  # gray=rest, blue=task
-                    ax_lab.imshow(
-                        y[row_order][:, None],
-                        aspect="auto",
-                        cmap=cmap_lbl,
-                        vmin=0,
-                        vmax=1,
-                    )
-                    ax_lab.set_xticks([])
-                    ax_lab.set_yticks([])
-                    ax_lab.set_title("class")
-
-                    # --- vertical centers in the main heatmap (data coords) ---
-                    n_rows = Xz.shape[0]
-                    y_center_rest = (split - 1) / 2.0 if split > 0 else -0.5
-                    y_center_task = (
-                        (split + (n_rows - 1)) / 2.0 if n_rows > split else split - 0.5
-                    )
-
-                    # centers (already computed)
-                    # y_center_rest, y_center_task
-                    # x positions on the strip:
-                    x_right_ax = 1.0 - 0.02
-
-                    # blended: x in ax_lab axes coords, y in ax_main data coords
-                    blended = mtransforms.blended_transform_factory(
-                        ax_lab.transAxes, ax_main.transData
-                    )
-
-                    # left label: place at strip edge, nudge  +6 pts right
-                    ax_main.annotate(
-                        "rest (0)",
-                        xy=(x_right_ax, y_center_rest),
-                        xycoords=blended,
-                        xytext=(6, 0),
-                        textcoords="offset points",  # -> to the right of the strip
-                        ha="left",
-                        va="center",
-                        fontsize=9,
-                        zorder=7,
-                    )
-
-                    # right label: place at strip edge, nudge  +6 pts further right
-                    ax_main.annotate(
-                        "task (1)",
-                        xy=(x_right_ax, y_center_task),
-                        xycoords=blended,
-                        xytext=(6, 0),
-                        textcoords="offset points",  # -> outside the strip on the right
-                        ha="left",
-                        va="center",
-                        fontsize=9,
-                        zorder=7,
-                    )
-
-                    # colorbar (small)
-                    cax = fig.add_axes(
-                        [0.12, 0.06, 0.3, 0.02]
-                    )  # x, y, w, h in figure coords
-                    cb = plt.colorbar(im, cax=cax, orientation="horizontal")
-                    cb.set_label("z-scored feature value", fontsize=9)
-                    cb.ax.tick_params(labelsize=8)
-
                     # if the folder does not exist, create it
                     if not os.path.exists(f"{output_root}/{measure_name}"):
                         os.makedirs(f"{output_root}/{measure_name}")
 
-                    plt.savefig(
-                        f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
-                        dpi=150,
-                        bbox_inches="tight",
-                        pad_inches=0.2,
-                        format="png",
+                    # A) Unsorted (your first vis, but rotated so time is horizontal)
+                    plot_samples_features(
+                        X,
+                        y,
+                        sample_order="original",
+                        feature_order="original",
+                        save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_unsorted_{task}_{group}{raw_or_embedded}.png",
+                        show=False,
                     )
-                    plt.close()
 
-                    # plot unsorted version as well
-                    print(f"Embedded shape: {X.shape}")
+                    # B) Label-sorted (your third vis)
+                    plot_samples_features(
+                        X,
+                        y,
+                        sample_order="label",
+                        feature_order="original",
+                        save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-label_{task}_{group}{raw_or_embedded}.png",
+                        show=False,
+                    )
 
-                    # plot X_embedded and y in an imshow as subplots
+                    # C) Label + within-class clustering + t-stat top bar
                     if group == "train":
-                        w = 30
-                        h = 10
+                        orders = plot_samples_features(
+                            X,
+                            y,
+                            sample_order="label+cluster",
+                            feature_order="tstat",
+                            save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
+                            show=False,
+                        )
                     elif group == "test":
-                        w = 10
-                        h = 10
-                    # fig, axs = plt.subplots(2, 1, figsize=(w, h))
-                    # make bottom subplot skinny like a color strip
-                    fig, axs = plt.subplots(
-                        2,
-                        1,
-                        figsize=(w, h),
-                        sharex=True,
-                        gridspec_kw={"height_ratios": [15, 1], "hspace": 0.1},
-                    )
-
-                    split = np.sum(y == 0)  # number of rest samples
-                    axs[0].imshow(X.T, aspect="auto", origin="lower", cmap="seismic")
-                    axs[0].set_title("LE Embedded Features")
-                    axs[0].set_xlabel("Sample")
-                    axs[0].set_ylabel("Feature")
-
-                    axs[1].imshow(
-                        y[np.newaxis, :], aspect=20, origin="lower", cmap="seismic"
-                    )
-                    axs[1].set_title("Target")
-                    axs[1].set_xticks([])
-                    axs[1].set_yticks([])
-                    plt.savefig(
-                        f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_unsorted_{task}_{group}{raw_or_embedded}.png",
-                        dpi=150,
-                        bbox_inches="tight",
-                        pad_inches=0.2,
-                        format="png",
-                    )
-                    plt.close()
-
-                    # sort the data such that y=1 samples come first
-                    sort_indices = np.argsort(y, kind="stable")
-                    X = X[sort_indices]
-                    y = y[sort_indices]
-
-                    # plot X_embedded and y in an imshow as subplots
-                    # fig, axs = plt.subplots(2, 1, figsize=(w, h))
-                    # make bottom subplot skinny like a color strip
-                    fig, axs = plt.subplots(
-                        2,
-                        1,
-                        figsize=(w, h),
-                        sharex=True,
-                        gridspec_kw={"height_ratios": [15, 1], "hspace": 0.1},
-                    )
-
-                    axs[0].imshow(X.T, aspect="auto", origin="lower", cmap="seismic")
-                    axs[0].axvline(
-                        split - 0.5, color="k", lw=3
-                    )  # separator between rest/task
-                    axs[0].set_title("LE Embedded Features")
-                    axs[0].set_xlabel("Sample")
-                    axs[0].set_ylabel("Feature")
-
-                    axs[1].imshow(
-                        y[np.newaxis, :], aspect=20, origin="lower", cmap="seismic"
-                    )
-                    axs[1].set_title("Target")
-                    axs[1].set_xticks([])
-                    axs[1].set_yticks([])
-                    plt.savefig(
-                        f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-label_{task}_{group}{raw_or_embedded}.png",
-                        dpi=150,
-                        bbox_inches="tight",
-                        pad_inches=0.2,
-                        format="png",
-                    )
-                    plt.close()
+                        # Apply the *same feature order* to test (no leakage from test):
+                        plot_samples_features(
+                            X,
+                            y,
+                            sample_order="label+cluster",  # clustering is per-split; that’s fine
+                            feature_order="tstat",  # we still show the t-bar for reference
+                            col_order_from_train=orders["col_order"],
+                            save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
+                            show=False,
+                        )
diff --git a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
index 42ff2f3..9892bbd 100644
--- a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
+++ b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
@@ -6,6 +6,9 @@
 #SBATCH --time=05:00:00
 #SBATCH --mem=32G
 
+# Make sure logs dir exists (in case you submit from elsewhere)
+mkdir -p logs
+
 # === Activate virtual environment ===
 source "/home/mt00/venvs/pydfc/bin/activate"
 
@@ -34,13 +37,20 @@ if [ ! -f "$SCRIPT_PATH" ]; then
     exit 1
 fi
 
+# === Per-script memory selection ===
+if [ "$SCRIPT_NAME" = "sample_matrix_visualization.py" ]; then
+    MEM_FLAG="--mem=64G"
+else
+    MEM_FLAG="--mem=32G"
+fi
+
 # === Run based on script name ===
 case "$SCRIPT_NAME" in
     ml_results.py | dfc_visualization.py | LE_embedding_visualization.py | sample_matrix_visualization.py | task_presence_binarization.py | task_timing_stats.py)
-        python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO" --simul_or_real "$SIMUL_OR_REAL"
+        srun $MEM_FLAG python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO" --simul_or_real "$SIMUL_OR_REAL"
         ;;
     cohensd.py)
-        python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO"
+        srun $MEM_FLAG python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO"
         ;;
     *)
         echo "Unknown script: $SCRIPT_NAME"

From d302a41244c01d6e2c0825f29ec2bb4dd37e4ff5 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 21 Oct 2025 23:04:13 -0400
Subject: [PATCH 267/401] sample_matrix_visualization: move pyright ignore onto import line

---
 .../multi_dataset_analysis/sample_matrix_visualization.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index e1c9261..2f5797f 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -13,8 +13,8 @@
 )
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-from helper_functions import (
-    plot_samples_features,  # pyright: ignore[reportMissingImports]
+from helper_functions import (  # pyright: ignore[reportMissingImports]
+    plot_samples_features,
 )
 
 use_raw_features = False  # if True, use raw dFC features instead of embedded features

From 6af5cf52c0a68d8634a5555de5e6aad0476d1dab Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 22 Oct 2025 15:35:39 -0400
Subject: [PATCH 268/401] minor

---
 task_dFC/multi_dataset_analysis/cohensd.py    |  2 +-
 .../helper_functions.py                       | 44 ++++++++++++++++---
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/cohensd.py b/task_dFC/multi_dataset_analysis/cohensd.py
index b9791a7..34ea7cb 100644
--- a/task_dFC/multi_dataset_analysis/cohensd.py
+++ b/task_dFC/multi_dataset_analysis/cohensd.py
@@ -349,7 +349,7 @@
 
     plt.xlabel("Max |Cohen's d| per Task", fontweight="bold", fontsize=14)
     plt.ylabel("SVM Balanced Accuracy", fontweight="bold", fontsize=14)
-    plt.legend(fontsize=12)
+    # plt.legend(fontsize=12)
     correlation = merged["abs_max"].corr(merged["score"])
     plt.text(
         0.05,
diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 5007d11..fcaa823 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -922,6 +922,17 @@ def figure_dfc_matrices_window_png(
 ###################### sample_matrix plots ######################
 
 
+def nice_step(n, max_ticks=10):
+    """Return a 'nice' integer step (1, 2, 5 or 10 times 10^k) to keep ≤ max_ticks across [1..n]."""
+    if n <= 1:
+        return 1
+    raw = max(1.0, n / max(2, (max_ticks - 1)))  # smallest usable step, never below 1
+    exp = np.floor(np.log10(raw))  # decade (power of ten) of the raw step
+    frac = raw / (10**exp)  # raw step rescaled by its decade
+    base = 1 if frac <= 1 else 2 if frac <= 2 else 5 if frac <= 5 else 10  # round frac up
+    return int(base * (10**exp))
+
+
 def plot_samples_features(
     X,
     y,
@@ -930,7 +941,7 @@ def plot_samples_features(
     feature_order="original",  # "original" | "tstat"
     col_order_from_train=None,  # optional np.ndarray (feature indices) to reuse on test
     ZSCORE=True,
-    V_RANGE=2.0,
+    V_RANGE=None,
     cmap="coolwarm",
     title=None,
     save_path=None,
@@ -1018,20 +1029,39 @@ def order_rows(A):
     ax_main = fig.add_subplot(gs[0, 0])
     ax_lab = fig.add_subplot(gs[1, 0])
 
+    # --- VRANGE ---
+    if V_RANGE is None:
+        Xflat = np.asarray(Xz, float).ravel()
+        lo, hi = np.nanpercentile(Xflat, [5, 95])  # robust to outliers; tweak if needed
+        V_RANGE = max(abs(lo), abs(hi))  # symmetric around 0 (for diverging cmap)
+
     # ---------- main heatmap ----------
     img = Xz[row_order, :][:, col_order].T  # (features, samples)
     im = ax_main.imshow(
         img, aspect="auto", origin="lower", cmap=cmap, vmin=-V_RANGE, vmax=V_RANGE
     )
     n_features = img.shape[0]
+    last_idx = n_features - 1
+
     if n_features < 10:
-        yticks_step = 1
+        # every feature: labels 1..n, positions 0..n-1
+        labels_1based = np.arange(1, n_features + 1, dtype=int)
     else:
-        yticks_step = 2
-    ticks = np.arange(0, n_features, yticks_step, dtype=int)
-    ax_main.set_yticks(ticks)
-    labels = [str(int(t) + 1) for t in ticks]
-    ax_main.set_yticklabels(labels)
+        step = nice_step(n_features, max_ticks=10)
+        # use round multiples of the step
+        labels_1based = list(np.arange(step, n_features + 1, step, dtype=int))
+        # de-dup & sort (in case step == 1)
+        labels_1based = np.unique(labels_1based)
+
+    # convert 1-based labels to 0-based tick positions
+    ticks_pos = labels_1based - 1
+
+    # lock y-limits so the last tick isn't clipped
+    ax_main.set_ylim(-0.5, last_idx + 0.5)
+
+    # set ticks & labels
+    ax_main.set_yticks(ticks_pos)
+    ax_main.set_yticklabels([f"{v:d}" for v in labels_1based])
     ax_main.set_ylabel("feature", fontsize=12, fontweight="bold")
     ax_main.set_xlabel("sample", fontsize=12, fontweight="bold")
     ax_main.set_xticks([])

From f4d39ec8da66232ce6b57c34e1f1716ddbe1dd22 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 24 Oct 2025 12:41:58 -0400
Subject: [PATCH 269/401] improve run_across_dataset_analysis.sh

---
 .../run_across_dataset_analysis.sh            | 49 ++++++-------------
 1 file changed, 16 insertions(+), 33 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
index 9892bbd..4407a60 100644
--- a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
+++ b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
@@ -5,58 +5,41 @@
 #SBATCH --error=logs/%x_err.txt
 #SBATCH --time=05:00:00
 #SBATCH --mem=32G
+#SBATCH --chdir=/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/multi_dataset_analysis/codes
 
-# Make sure logs dir exists (in case you submit from elsewhere)
-mkdir -p logs
+set -euo pipefail
 
-# === Activate virtual environment ===
+mkdir -p logs
 source "/home/mt00/venvs/pydfc/bin/activate"
 
-# === Global variable ===
 MULTI_DATASET_INFO="/home/mt00/pydfc/dFC/task_dFC/run_scripts_slurm/multi_dataset_info.json"
 
-# === Arguments ===
-SCRIPT_NAME=$1        # e.g., ml_results.py
-SIMUL_OR_REAL=$2      # e.g., real or simulated
+SCRIPT_NAME=${1:-}
+SIMUL_OR_REAL=${2:-real}
 SCRIPT_DIR="/home/mt00/pydfc/dFC/task_dFC/multi_dataset_analysis"
+SCRIPT_PATH="$SCRIPT_DIR/$SCRIPT_NAME"
 
-# === Safety checks ===
 if [ -z "$SCRIPT_NAME" ]; then
     echo "Usage: sbatch run_analysis.sh <script_name> [real|simulated]"
     exit 1
 fi
 
-if [ -z "$SIMUL_OR_REAL" ]; then
-    SIMUL_OR_REAL="real"  # default
-fi
-
-SCRIPT_PATH="$SCRIPT_DIR/$SCRIPT_NAME"
-
 if [ ! -f "$SCRIPT_PATH" ]; then
     echo "Error: Script '$SCRIPT_PATH' not found."
     exit 1
 fi
 
-# === Per-script memory selection ===
-if [ "$SCRIPT_NAME" = "sample_matrix_visualization.py" ]; then
-    MEM_FLAG="--mem=64G"
-else
-    MEM_FLAG="--mem=32G"
-fi
-
-# === Run based on script name ===
 case "$SCRIPT_NAME" in
-    ml_results.py | dfc_visualization.py | LE_embedding_visualization.py | sample_matrix_visualization.py | task_presence_binarization.py | task_timing_stats.py)
-        srun $MEM_FLAG python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO" --simul_or_real "$SIMUL_OR_REAL"
-        ;;
-    cohensd.py)
-        srun $MEM_FLAG python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO"
-        ;;
-    *)
-        echo "Unknown script: $SCRIPT_NAME"
-        exit 1
-        ;;
+  ml_results.py | dfc_visualization.py | LE_embedding_visualization.py | sample_matrix_visualization.py | task_presence_binarization.py | task_timing_stats.py)
+    python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO" --simul_or_real "$SIMUL_OR_REAL"
+    ;;
+  cohensd.py)
+    python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO"
+    ;;
+  *)
+    echo "Unknown script: $SCRIPT_NAME"
+    exit 1
+    ;;
 esac
 
-# === Deactivate virtual environment ===
 deactivate

From 5132e4437c792191536c04b97b19b672b99e2cf5 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 31 Oct 2025 13:25:24 -0400
Subject: [PATCH 270/401] modify across run heatmap

---
 task_dFC/multi_dataset_analysis/ml_results.py | 30 ++++++++++---------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index 9c7ee9b..96b0293 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -447,7 +447,7 @@
         )
         plt.close(fig)
 
-        # ACROSS heatmap: color = median; annotation = Stability & n (ALWAYS range-based)
+        # ACROSS heatmap: color = median; annotation = min–max & n (across runs)
         if df_multi.empty:
             print(
                 f"[ACROSS-RUN] No tasks with ≥2 runs for {embedding} / {metric} — skipping across-run figures."
@@ -460,21 +460,22 @@
                 .reset_index()
             )
 
-            # metric bounds & heatmap scaling
+            # heatmap scaling (avoid name clash with s['vmin'] / s['vmax'])
             if metric == "SI":
-                rng = 2.0  # SI in [-1, 1]
-                vmin, vmax, center = None, 1.0, 0.0
+                cmin, cmax, ccenter = None, 1.0, 0.0  # SI in [-1,1], center at 0
             else:
-                rng = 1.0  # accuracies in [0, 1]
-                vmin, vmax, center = 0.5 - 1e-6, 1.0, 0.5
-
-            # ALWAYS: range-based stability
-            s["stability"] = (1.0 - ((s["vmax"] - s["vmin"]) / rng)).clip(0.0, 1.0)
+                cmin, cmax, ccenter = (
+                    0.5 - 1e-6,
+                    1.0,
+                    0.5,
+                )  # accuracy in [0.5,1], center at chance
 
             # pivots
             mat_across = s.pivot(index="task", columns="dFC method", values="med")
             ann_text = s.assign(
-                label=lambda d: d["stability"].map(lambda v: f"{v:.2f}")
+                label=lambda d: d["vmin"].map(lambda v: f"{v:.2f}")
+                + "\u2013"
+                + d["vmax"].map(lambda v: f"{v:.2f}")
                 + "\n"
                 + d["n"].map(lambda n: f"n={n}")
             ).pivot(index="task", columns="dFC method", values="label")
@@ -491,9 +492,9 @@
             fig, ax = plt.subplots(figsize=(w, h))
             hm = sns.heatmap(
                 mat_across.loc[row_order, col_order],
-                vmin=vmin,
-                vmax=vmax,
-                center=center,
+                vmin=cmin,
+                vmax=cmax,
+                center=ccenter,
                 cmap="coolwarm",
                 annot=ann_text.loc[row_order, col_order],
                 fmt="",
@@ -507,10 +508,11 @@
                 task_to_domain = {t: task_domain_real(t) for t in row_order}
                 domain_x_frac = -0.8
                 ylabel_pad_pts = 130
-            elif simul_or_real == "simulated":
+            else:  # "simulated"
                 task_to_domain = {t: task_domain_simul(t) for t in row_order}
                 domain_x_frac = -1.0
                 ylabel_pad_pts = 110
+
             add_domains_between_ylabel_and_ticks(
                 ax,
                 row_order=row_order,

From 72e04326ee8118bd75c0fa81510c0b22df898f79 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 31 Oct 2025 13:37:13 -0400
Subject: [PATCH 271/401] minor

---
 task_dFC/multi_dataset_analysis/ml_results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index 96b0293..f83d95c 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -487,7 +487,7 @@
             col_order = [m for m in method_order if m in mat_across.columns]
 
             # plot
-            w = max(9.0, 9 / 7 * len(col_order))
+            w = max(9.0, 20 / 7 * len(col_order))
             h = max(7.0, 7 / 20 * len(row_order))
             fig, ax = plt.subplots(figsize=(w, h))
             hm = sns.heatmap(

From 5ba0c8e16cab6652156ab46a0d320bc54cc3512f Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 31 Oct 2025 16:40:30 -0400
Subject: [PATCH 272/401] minor

---
 task_dFC/multi_dataset_analysis/ml_results.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index f83d95c..e2bf7eb 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -487,7 +487,7 @@
             col_order = [m for m in method_order if m in mat_across.columns]
 
             # plot
-            w = max(9.0, 20 / 7 * len(col_order))
+            w = max(9.0, 11 / 7 * len(col_order))
             h = max(7.0, 7 / 20 * len(row_order))
             fig, ax = plt.subplots(figsize=(w, h))
             hm = sns.heatmap(
@@ -506,12 +506,12 @@
             # domain sidebar & separators (your helper)
             if simul_or_real == "real":
                 task_to_domain = {t: task_domain_real(t) for t in row_order}
-                domain_x_frac = -0.8
-                ylabel_pad_pts = 130
+                domain_x_frac = -0.5
+                ylabel_pad_pts = 160
             else:  # "simulated"
                 task_to_domain = {t: task_domain_simul(t) for t in row_order}
-                domain_x_frac = -1.0
-                ylabel_pad_pts = 110
+                domain_x_frac = -0.6
+                ylabel_pad_pts = 140
 
             add_domains_between_ylabel_and_ticks(
                 ax,

From 510368131f135aa1175ba51fd30af8b90dfaae0e Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 9 Nov 2025 23:47:57 -0500
Subject: [PATCH 273/401] bug in ml_results

---
 task_dFC/multi_dataset_analysis/ml_results.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index e2bf7eb..4bae943 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -122,18 +122,12 @@
                     for key in ML_scores_new[level].keys()
                     if key not in keys_not_to_include
                 }
-                for task in TASKS:
-                    if task not in TASKS_to_include:
+                for i in range(len(ML_scores_new[level]["task"])):
+                    if ML_scores_new[level]["task"][i] not in TASKS_to_include:
                         continue
-                    if task not in ML_scores_new[level]["task"]:
-                        dFC_method = set(ML_scores_new[level]["dFC method"])
-                        print(f"Task {task} not in ML_scores of {dFC_method}. Skipping.")
-                        continue
-                    for i in range(len(ML_scores_new[level]["task"])):
-                        for key in ML_scores_new_updated.keys():
-                            ML_scores_new_updated[key].append(
-                                ML_scores_new[level][key][i]
-                            )
+
+                    for key in ML_scores_new_updated.keys():
+                        ML_scores_new_updated[key].append(ML_scores_new[level][key][i])
 
                 if ALL_ML_SCORES is None:
                     ALL_ML_SCORES = ML_scores_new_updated

From daac328329689b0135df3bf8f9dffe881e6b994f Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 10 Nov 2025 00:01:00 -0500
Subject: [PATCH 274/401] add performance_predict

---
 .../performance_predict.py                    | 429 ++++++++++++++++++
 1 file changed, 429 insertions(+)
 create mode 100644 task_dFC/multi_dataset_analysis/performance_predict.py

diff --git a/task_dFC/multi_dataset_analysis/performance_predict.py b/task_dFC/multi_dataset_analysis/performance_predict.py
new file mode 100644
index 0000000..dbc1989
--- /dev/null
+++ b/task_dFC/multi_dataset_analysis/performance_predict.py
@@ -0,0 +1,429 @@
+import argparse
+import json
+import os
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+
+from pydfc import data_loader
+from pydfc.data_loader import find_subj_list
+from pydfc.ml_utils import find_available_subjects, load_task_data
+from pydfc.task_utils import (
+    calc_relative_task_on,
+    calc_rest_duration,
+    calc_task_duration,
+    calc_transition_freq,
+    cohen_d_bold,
+    extract_task_presence,
+)
+
+fig_bbox_inches = "tight"
+fig_pad = 0.1
+show_title = False
+save_fig_format = "png"  # pdf, png,
+
+level = "group_lvl"
+keys_not_to_include = [
+    "Logistic regression permutation p_value",
+    "Logistic regression permutation score mean",
+    "Logistic regression permutation score std",
+    "SVM permutation p_value",
+    "SVM permutation score mean",
+    "SVM permutation score std",
+]
+
+#######################################################################################
+
+if __name__ == "__main__":
+    # argparse
+    HELPTEXT = """
+    Script to predict performance based on task design features and BOLD signals across multiple datasets.
+    """
+
+    parser = argparse.ArgumentParser(description=HELPTEXT)
+
+    parser.add_argument(
+        "--multi_dataset_info", type=str, help="path to multi-dataset info file"
+    )
+    parser.add_argument(
+        "--simul_or_real", type=str, help="Specify 'simulated' or 'real' data"
+    )
+
+    args = parser.parse_args()
+
+    multi_dataset_info = args.multi_dataset_info
+    simul_or_real = args.simul_or_real
+
+    # Read dataset info
+    with open(multi_dataset_info, "r") as f:
+        multi_dataset_info = json.load(f)
+
+    if simul_or_real == "real":
+        main_root = multi_dataset_info["real_data"]["main_root"]
+        DATASETS = multi_dataset_info["real_data"]["DATASETS"]
+        TASKS_to_include = multi_dataset_info["real_data"]["TASKS_to_include"]
+    elif simul_or_real == "simulated":
+        main_root = multi_dataset_info["simulated_data"]["main_root"]
+        DATASETS = multi_dataset_info["simulated_data"]["DATASETS"]
+        TASKS_to_include = multi_dataset_info["simulated_data"]["TASKS_to_include"]
+    output_root = (
+        f"{multi_dataset_info['output_root']}/performance_predictor/{simul_or_real}"
+    )
+
+    if not os.path.exists(output_root):
+        os.makedirs(output_root)
+
+    task_ratio_all = {}
+    transition_freq_all = {}
+    rest_durations_all = {}
+    task_durations_all = {}
+    for dataset in DATASETS:
+
+        print(f"Processing dataset: {dataset}")
+        dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
+        roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
+        dFC_root = f"{main_root}/{dataset}/derivatives/dFC_assessed"
+
+        # Read dataset info
+        with open(dataset_info_file, "r") as f:
+            dataset_info = json.load(f)
+
+        if "SESSIONS" in dataset_info:
+            SESSIONS = dataset_info["SESSIONS"]
+        else:
+            SESSIONS = None
+        if SESSIONS is None:
+            SESSIONS = [None]
+
+        TASKS = dataset_info["TASKS"]
+
+        if "RUNS" in dataset_info:
+            RUNS = dataset_info["RUNS"]
+        else:
+            RUNS = None
+        if RUNS is None:
+            RUNS = {task: [None] for task in TASKS}
+
+        for session in SESSIONS:
+            for task_id, task in enumerate(TASKS):
+                if not task in TASKS_to_include:
+                    continue
+                for run in RUNS[task]:
+                    SUBJECTS = find_subj_list(roi_root)
+                    # print(f"Number of subjects: {len(SUBJECTS)}")
+
+                    for subj in SUBJECTS:
+
+                        try:
+                            task_data = load_task_data(
+                                roi_root=roi_root,
+                                subj=subj,
+                                task=task,
+                                run=run,
+                                session=session,
+                            )
+                        except FileNotFoundError:
+                            continue
+
+                        task_presence, indices = extract_task_presence(
+                            event_labels=task_data["event_labels"],
+                            TR_task=1 / task_data["Fs_task"],
+                            TR_mri=task_data["TR_mri"],
+                            binary=True,
+                            binarizing_method="GMM",
+                            no_hrf=False,
+                        )
+
+                        relative_task_on = calc_relative_task_on(task_presence[indices])
+                        num_of_transitions, relative_transition_freq = (
+                            calc_transition_freq(task_presence[indices])
+                        )
+                        # calculate rest and task durations based on the original event labels
+                        event_labels = np.multiply(task_data["event_labels"] != 0, 1)
+                        rest_durations = calc_rest_duration(
+                            event_labels, TR_mri=1 / task_data["Fs_task"]
+                        )
+                        task_durations = calc_task_duration(
+                            event_labels, TR_mri=1 / task_data["Fs_task"]
+                        )
+
+                        if not task in task_ratio_all:
+                            task_ratio_all[task] = []
+                        if not task in transition_freq_all:
+                            transition_freq_all[task] = []
+                        if not task in rest_durations_all:
+                            rest_durations_all[task] = []
+                        if not task in task_durations_all:
+                            task_durations_all[task] = []
+                        task_ratio_all[task].append(relative_task_on)
+                        transition_freq_all[task].append(relative_transition_freq)
+                        # rest_durations and task_durations are lists
+                        rest_durations_all[task].extend(rest_durations)
+                        task_durations_all[task].extend(task_durations)
+
+    task_design_features = {
+        "task_ratio_all": task_ratio_all,
+        "transition_freq_all": transition_freq_all,
+        "rest_durations_all": rest_durations_all,
+        "task_durations_all": task_durations_all,
+    }
+
+    CohensD_across_task = {}
+    for dataset in DATASETS:
+        print(f"Processing dataset: {dataset}")
+        dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
+        roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
+        dFC_root = f"{main_root}/{dataset}/derivatives/dFC_assessed"
+
+        # Read dataset info
+        with open(dataset_info_file, "r") as f:
+            dataset_info = json.load(f)
+
+        if "SESSIONS" in dataset_info:
+            SESSIONS = dataset_info["SESSIONS"]
+        else:
+            SESSIONS = None
+        if SESSIONS is None:
+            SESSIONS = [None]
+
+        TASKS = dataset_info["TASKS"]
+
+        if "RUNS" in dataset_info:
+            RUNS = dataset_info["RUNS"]
+        else:
+            RUNS = None
+        if RUNS is None:
+            RUNS = {task: [None] for task in TASKS}
+
+        for task in TASKS:
+            if task not in TASKS_to_include:
+                print(f"Skipping task {task} as it's not in the inclusion list.")
+                continue
+            d_values_all = []
+            for session in SESSIONS:
+                print(f"Processing task: {task}")
+                SUBJECTS = find_available_subjects(
+                    dFC_root=dFC_root,
+                    task=task,
+                    dFC_id=None,
+                    session=session,
+                )
+                for subj in SUBJECTS:
+                    for run in RUNS[task]:
+                        try:
+                            task_data = load_task_data(
+                                roi_root=roi_root,
+                                subj=subj,
+                                task=task,
+                                run=run,
+                                session=session,
+                            )
+                        except Exception:  # not bare: let Ctrl-C / SystemExit propagate
+                            continue
+
+                        if run is None:
+                            if session is None:
+                                BOLD_file_name = "{subj_id}_{task}_time-series.npy"
+                            else:
+                                BOLD_file_name = (
+                                    "{subj_id}_{session}_{task}_time-series.npy"
+                                )
+                        else:
+                            if session is None:
+                                BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy"
+                            else:
+                                BOLD_file_name = (
+                                    "{subj_id}_{session}_{task}_{run}_time-series.npy"
+                                )
+                        try:
+                            BOLD = data_loader.load_TS(
+                                data_root=roi_root,
+                                file_name=BOLD_file_name,
+                                subj_id2load=subj,
+                                task=task,
+                                session=session,
+                                run=run,
+                            )
+                        except Exception as e:
+                            print(f"Error loading BOLD data: {e}")
+                            continue
+                        BOLD_data = BOLD.data  # np.ndarray (n_ROIs, n_TRs)
+
+                        Fs_task = task_data["Fs_task"]
+                        TR_task = 1 / Fs_task
+
+                        TR_array = np.arange(0, BOLD_data.shape[1])
+                        task_presence, indices = extract_task_presence(
+                            event_labels=task_data["event_labels"],
+                            TR_task=TR_task,
+                            TR_mri=task_data["TR_mri"],
+                            binary=True,
+                            binarizing_method="GMM",
+                            no_hrf=False,
+                            TR_array=TR_array,
+                        )
+
+                        # if n_TRs do not match, align them
+                        if BOLD_data.shape[1] != task_presence.shape[0]:
+                            print(
+                                f"Before alignment, shape of task_presence: {task_presence.shape}, shape of BOLD_data: {BOLD_data.shape}"
+                            )
+                            min_TRs = min(BOLD_data.shape[1], task_presence.shape[0])
+                            task_presence = task_presence[:min_TRs]
+                            BOLD_data = BOLD_data[:, :min_TRs]
+                            print(
+                                f"After alignment, shape of task_presence: {task_presence.shape}, shape of BOLD_data: {BOLD_data.shape}"
+                            )
+                            # also adjust indices
+                            indices = [i for i in indices if i < min_TRs]
+                        task_presence = task_presence[indices]  # (n_TRs,)
+                        BOLD_data = BOLD_data[:, indices]  # (n_ROIs, n_TRs)
+
+                        assert BOLD_data.shape[1] == task_presence.shape[0]
+
+                        cohen_d = cohen_d_bold(X=BOLD_data.T, y=task_presence)
+                        d_values_all.append(cohen_d)
+
+            if len(d_values_all) == 0:
+                print(f"No data found for task {task} in dataset {dataset}. Skipping.")
+                continue
+            d_values_all = np.array(d_values_all)  # (n_subjectsxrunsxsessions, n_ROIs)
+            avg_d_values = np.nanmean(d_values_all, axis=0)  # (n_ROIs,)
+            if not task in CohensD_across_task:
+                CohensD_across_task[task] = []
+            CohensD_across_task[task].extend(avg_d_values)
+
+    ALL_ML_SCORES = None
+    for dataset in DATASETS:
+        print(f"Processing dataset: {dataset}")
+        dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
+        ML_root = f"{main_root}/{dataset}/derivatives/ML"
+
+        # Read dataset info
+        with open(dataset_info_file, "r") as f:
+            dataset_info = json.load(f)
+
+        if "SESSIONS" in dataset_info:
+            SESSIONS = dataset_info["SESSIONS"]
+        else:
+            SESSIONS = None
+        if SESSIONS is None:
+            SESSIONS = [None]
+
+        TASKS = dataset_info["TASKS"]
+
+        if "RUNS" in dataset_info:
+            RUNS = dataset_info["RUNS"]
+        else:
+            RUNS = None
+        if RUNS is None:
+            RUNS = {task: [None] for task in TASKS}
+
+        # find all ML_scores_classify_dFC-id.npy in the ML_root/classification/ folder
+        # for now we will only use the first session
+        session = SESSIONS[0]
+        if session is None:
+            input_dir = f"{ML_root}/classification"
+        else:
+            input_dir = f"{ML_root}/classification/{session}"
+        if not os.path.exists(input_dir):
+            print(
+                f"Input directory {input_dir} does not exist. Skipping dataset {dataset}."
+            )
+            continue
+        ALL_ML_SCORES_FILES = os.listdir(input_dir)
+        ALL_ML_SCORES_FILES = [
+            f for f in ALL_ML_SCORES_FILES if "ML_scores_classify_" in f
+        ]
+        for f in ALL_ML_SCORES_FILES:
+            try:
+                ML_scores_new = np.load(f"{input_dir}/{f}", allow_pickle=True).item()
+                # ML_scores_new_updated is a new dictionary with same keys as ML_scores_new but empty lists
+                ML_scores_new_updated = {
+                    key: []
+                    for key in ML_scores_new[level].keys()
+                    if key not in keys_not_to_include
+                }
+                for i in range(len(ML_scores_new[level]["task"])):
+                    # filter on this entry's own task (a bare `task` is a stale loop leftover)
+                    if ML_scores_new[level]["task"][i] not in TASKS_to_include:
+                        continue
+                    for key in ML_scores_new_updated.keys():
+                        ML_scores_new_updated[key].append(ML_scores_new[level][key][i])
+
+                if ALL_ML_SCORES is None:
+                    ALL_ML_SCORES = ML_scores_new_updated
+                else:
+                    for key in ML_scores_new_updated.keys():
+                        if key in ALL_ML_SCORES:
+                            ALL_ML_SCORES[key].extend(ML_scores_new_updated[key])
+            except Exception as e:
+                print(f"Error loading {f}: {e}")
+                continue
+
+    # check that the lists in all keys have the same length
+    if ALL_ML_SCORES is not None:
+        lengths = [len(v) for v in ALL_ML_SCORES.values()]
+        if len(set(lengths)) != 1:
+            print(
+                f"Warning: Not all keys have the same length in ALL_ML_SCORES. key and length pairs: {dict(zip(ALL_ML_SCORES.keys(), lengths))}"
+            )
+
+    embedding = "LE"
+    metric = "SVM balanced accuracy"
+    GROUP = "test"
+
+    all_scores = {}
+    for i in range(len(ALL_ML_SCORES["task"])):
+        if (
+            ALL_ML_SCORES["embedding"][i] == embedding
+            and ALL_ML_SCORES["group"][i] == GROUP
+        ):
+
+            if ALL_ML_SCORES["task"][i] not in all_scores:
+                all_scores[ALL_ML_SCORES["task"][i]] = []
+            all_scores[ALL_ML_SCORES["task"][i]].append(ALL_ML_SCORES[metric][i])
+
+    # all_scores maps each task to an array of scores across methods and runs
+    all_scores = {k: np.array(v) for k, v in all_scores.items()}
+
+    # we have task design features in task_design_features[task_ratio_all][task], task_design_features[transition_freq_all][task], task_design_features[rest_durations_all][task], task_design_features[task_durations_all][task]
+    # we have CohensD in CohensD_across_task[task]
+    # we have ML scores in all_scores[task]
+
+    DATA = {
+        "task": [],
+        "task_ratio": [],
+        "transition_freq": [],
+        "rest_durations_mean": [],
+        "task_durations_mean": [],
+        "rest_durations_std": [],
+        "task_durations_std": [],
+        "cohen_d_max": [],
+        "classfication_score_mean": [],
+    }
+    for task in TASKS_to_include:
+        # A listed task can lack design features, Cohen's d or ML scores (e.g. no data
+        # was found for it in any dataset); skip it instead of raising KeyError.
+        sources = (task_ratio_all, CohensD_across_task, all_scores)
+        if not all(task in source for source in sources):
+            print(f"Task {task} is missing features or scores. Skipping.")
+            continue
+        rest_durations = rest_durations_all[task]
+        task_durations = task_durations_all[task]
+
+        DATA["task"].append(task)
+        DATA["task_ratio"].append(np.mean(task_ratio_all[task]))
+        DATA["transition_freq"].append(np.mean(transition_freq_all[task]))
+        DATA["rest_durations_mean"].append(np.mean(rest_durations))
+        DATA["task_durations_mean"].append(np.mean(task_durations))
+        DATA["rest_durations_std"].append(np.std(rest_durations))
+        DATA["task_durations_std"].append(np.std(task_durations))
+        DATA["cohen_d_max"].append(np.max(np.abs(CohensD_across_task[task])))
+        DATA["classfication_score_mean"].append(np.mean(all_scores[task]))
+
+    # save DATA
+    np.save(f"{output_root}/performance_predictor_data.npy", DATA)

From 9e0154ac44e33c4674fcf8235f851dfa876ca7ef Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 10 Nov 2025 00:05:39 -0500
Subject: [PATCH 275/401] minor

---
 task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
index 4407a60..3bd6a4d 100644
--- a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
+++ b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
@@ -30,7 +30,7 @@ if [ ! -f "$SCRIPT_PATH" ]; then
 fi
 
 case "$SCRIPT_NAME" in
-  ml_results.py | dfc_visualization.py | LE_embedding_visualization.py | sample_matrix_visualization.py | task_presence_binarization.py | task_timing_stats.py)
+  performance_predict.py | ml_results.py | dfc_visualization.py | LE_embedding_visualization.py | sample_matrix_visualization.py | task_presence_binarization.py | task_timing_stats.py)
     python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO" --simul_or_real "$SIMUL_OR_REAL"
     ;;
   cohensd.py)

From bd3a8d6249dc142da7842591892b864037fe3abf Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 14 Nov 2025 11:58:10 -0500
Subject: [PATCH 276/401] add readme for task dFC

---
 task_dFC/README.rst | 89 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 task_dFC/README.rst

diff --git a/task_dFC/README.rst b/task_dFC/README.rst
new file mode 100644
index 0000000..c65909b
--- /dev/null
+++ b/task_dFC/README.rst
@@ -0,0 +1,89 @@
+This document provides documentation for the `task_dFC` module of the PydFC toolbox, presented in reStructuredText (.rst) format. This module implements the analytical pipeline described in the associated manuscript, "From Rest to Task: Tracking Moment-to-Moment Cognitive State Using Dynamic Functional Connectivity".
+
+***
+
+```rst
+.. raw:: html
+
+   <a href="https://github.com/neurodatascience/dFC"><img src="https://img.shields.io/badge/GitHub-neurodatascience%2FdFC-blue.svg" alt="GitHub Repository"></a>
+
+=======================================================
+PydFC: task_dFC Module Documentation
+=======================================================
+
+The ``task_dFC`` module provides a scalable, open-source Python solution for the **large-scale benchmarking and application of dynamic functional connectivity (dFC) methods**.
+
+Its core purpose is to apply end-to-end analytical workflows to fMRI data to assess the efficacy of various dFC methodologies in **predicting moment-to-moment cognitive states**—specifically, distinguishing between moments of task engagement versus rest at the single repetition time (TR) resolution.
+
+Methods Implemented
+-------------------
+
+The module supports a diverse selection of seven well-established dFC methodologies implemented within the PydFC toolbox :
+
+*   **State-free Methods:** Designed to capture continuous fluctuations in connectivity.
+    *   Sliding Window (SW) .
+    *   Time-Frequency (TF) .
+
+*   **State-based Methods:** Designed to identify recurring, discrete connectivity patterns or states.
+    *   Co-Activation Patterns (CAP) .
+    *   Clustering .
+    *   Continuous Hidden Markov Models (HMM) .
+    *   Discrete Hidden Markov Models (HMM) .
+    *   Windowless (WL) .
+
+Analysis Pipeline: Script-Based Workflow
+---------------------------------------
+
+The ``task_dFC`` workflow starts assuming that fMRI data (in BIDS format with ``events.tsv``) has undergone standard preprocessing and denoising (e.g., via fMRIprep and Nipoppy) . The subsequent analysis is executed sequentially through the following scripts:
+
+1. ``nifti_to_roi_signal.py``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+**Function:** Extracts regional BOLD time series from preprocessed NIfTI data .
+**Details:** Voxel-wise BOLD signals are parcellated, typically using an atlas such as the Schaefer 100-region atlas, yielding regional time series that serve as the input for dFC assessment.
+
+2. ``FCS_estimate.py``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+**Function:** Estimates Functional Connectivity States (FCS) .
+**Details:** This script fits the dFC model required by **state-based methodologies** (CAP, HMM, Clustering) that rely on identifying **group-level recurring patterns** . The number of brain states for these methods is typically set to five .
+
+3. ``dFC_assessment.py``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+**Function:** Computes time-resolved dFC patterns .
+**Details:** The script applies the seven implemented dFC methodologies (SW, TF, CAP, etc.) to the BOLD signals of each run and subject to obtain the corresponding high-dimensional dFC patterns .
+
+4. ``ML.py``
+~~~~~~~~~~~~~~~~~~~~
+**Function:** Implements the core machine learning pipeline, including cognitive state labeling, feature extraction, supervised classification, and separability analysis .
+
+**A. Task Presence Labeling**
+*   Initial stimulus timings from ``events.tsv`` are convolved with a canonical **Hemodynamic Response Function (HRF)** to account for hemodynamic delay.
+*   The HRF-convolved signal is binarized using a **Gaussian Mixture Model (GMM)** to assign time points as "rest" or "task-present" . This process critically identifies and removes ambiguous **"gray zone" time points** corresponding to transitions, improving classifier performance .
+
+**B. Feature Extraction and Reduction**
+*   **State-free Methods (SW, TF):** DFC matrices are vectorized (e.g., 4950 connections) . **Laplacian Eigenmaps (LE)** dimensionality reduction is applied to make the high-dimensional discriminative information accessible to classifiers, as raw dFC features often yield near-chance accuracy otherwise .
+*   **State-based Methods (CAP, HMM, etc.):** Features are derived from state probabilities, distances from states, or state weights . These resulting compositional features (shape (time, 5)) are transformed using an **isometric log-ratio (ILR) transformation** to yield 4 features per time point .
+
+**C. Prediction and Evaluation**
+*   A **Support Vector Machine (SVM) with an RBF kernel** is trained to predict the cognitive state (rest vs. task) at the single-TR level .
+*   **Balanced Accuracy** is used as the primary metric, ensuring chance performance is 50% .
+*   **Cognitive State Separability (CSS)** is quantified using the **Silhouette Index (SI)** to evaluate whether task and rest samples are intrinsically distinguishable in the feature space without supervision .
+
+5. ``generate_report.py``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+**Function:** Summarizes classification efficacy and separability results .
+**Details:** Generates figures, tables, and reports (e.g., heatmaps and boxplots) documenting Balanced Accuracy and SI scores across methods and paradigms.
+
+6. ``multi_dataset_analysis``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+**Function:** Contains scripts for aggregating and comparing results across multiple datasets and paradigms.
+**Details:** Facilitates **large-scale benchmarking** by calculating aggregate performance statistics (e.g., median accuracy) across the 17 datasets and 29 task paradigms analyzed.
+
+Key Findings from Benchmarking
+------------------------------
+
+Based on the analysis implemented via the ``task_dFC`` pipeline across real datasets:
+
+*   **Time-Frequency** is often the most reliable method, consistently achieving high mean and median accuracies across paradigms, while **Sliding Window** exhibits the most variability but can achieve near-perfect accuracy (e.g., for the localiser task) .
+*   The use of **Laplacian Eigenmaps (LE)** is essential for the high predictive accuracy of state-free methods, as PCA-transformed features often yield chance-level accuracies.
+*   Cognitive states in real data generally show **low intrinsic separability (low SI)** . This underscores that **supervised classifiers (SVM)** are necessary to leverage the subtle high-dimensional dFC structure and achieve accurate prediction .
+```

From 6129d923481df6e70ef418aebb9d9160adabae56 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 14 Nov 2025 12:01:26 -0500
Subject: [PATCH 277/401] minor

---
 task_dFC/README.rst | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/task_dFC/README.rst b/task_dFC/README.rst
index c65909b..df63075 100644
--- a/task_dFC/README.rst
+++ b/task_dFC/README.rst
@@ -1,8 +1,3 @@
-This document provides documentation for the `task_dFC` module of the PydFC toolbox, presented in reStructuredText (.rst) format. This module implements the analytical pipeline described in the associated manuscript, "From Rest to Task: Tracking Moment-to-Moment Cognitive State Using Dynamic Functional Connectivity".
-
-***
-
-```rst
 .. raw:: html
 
    <a href="https://github.com/neurodatascience/dFC"><img src="https://img.shields.io/badge/GitHub-neurodatascience%2FdFC-blue.svg" alt="GitHub Repository"></a>
@@ -41,17 +36,17 @@ The ``task_dFC`` workflow starts assuming that fMRI data (in BIDS format with ``
 **Function:** Extracts regional BOLD time series from preprocessed NIfTI data .
 **Details:** Voxel-wise BOLD signals are parcellated, typically using an atlas such as the Schaefer 100-region atlas, yielding regional time series that serve as the input for dFC assessment.
 
-2. ``FCS_estimate.py``
+1. ``FCS_estimate.py``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 **Function:** Estimates Functional Connectivity States (FCS) .
 **Details:** This script fits the dFC model required by **state-based methodologies** (CAP, HMM, Clustering) that rely on identifying **group-level recurring patterns** . The number of brain states for these methods is typically set to five .
 
-3. ``dFC_assessment.py``
+1. ``dFC_assessment.py``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 **Function:** Computes time-resolved dFC patterns .
 **Details:** The script applies the seven implemented dFC methodologies (SW, TF, CAP, etc.) to the BOLD signals of each run and subject to obtain the corresponding high-dimensional dFC patterns .
 
-4. ``ML.py``
+1. ``ML.py``
 ~~~~~~~~~~~~~~~~~~~~
 **Function:** Implements the core machine learning pipeline, including cognitive state labeling, feature extraction, supervised classification, and separability analysis .
 
@@ -86,4 +81,3 @@ Based on the analysis implemented via the ``task_dFC`` pipeline across real data
 *   **Time-Frequency** is often the most reliable method, consistently achieving high mean and median accuracies across paradigms, while **Sliding Window** exhibits the most variability but can achieve near-perfect accuracy (e.g., for the localiser task) .
 *   The use of **Laplacian Eigenmaps (LE)** is essential for the high predictive accuracy of state-free methods, as PCA-transformed features often yield chance-level accuracies.
 *   Cognitive states in real data generally show **low intrinsic separability (low SI)** . This underscores that **supervised classifiers (SVM)** are necessary to leverage the subtle high-dimensional dFC structure and achieve accurate prediction .
-```

From 34d3d47138cfd050b39be5061153c5c7d375ab40 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 14 Nov 2025 12:26:50 -0500
Subject: [PATCH 278/401] minor editing

---
 task_dFC/README.rst | 40 ++++++++++++++++++----------------------
 1 file changed, 18 insertions(+), 22 deletions(-)

diff --git a/task_dFC/README.rst b/task_dFC/README.rst
index df63075..3562b76 100644
--- a/task_dFC/README.rst
+++ b/task_dFC/README.rst
@@ -8,7 +8,7 @@ PydFC: task_dFC Module Documentation
 
 The ``task_dFC`` module provides a scalable, open-source Python solution for the **large-scale benchmarking and application of dynamic functional connectivity (dFC) methods**.
 
-Its core purpose is to apply end-to-end analytical workflows to fMRI data to assess the efficacy of various dFC methodologies in **predicting moment-to-moment cognitive states**—specifically, distinguishing between moments of task engagement versus rest at the single repetition time (TR) resolution.
+Its core purpose is to apply end-to-end analytical workflows to fMRI data to assess the efficacy of various dFC methodologies in **predicting moment-to-moment cognitive states**: specifically, distinguishing between moments of task engagement and rest at single repetition time (TR) resolution.
 
 Methods Implemented
 -------------------
@@ -21,29 +21,32 @@ The module supports a diverse selection of seven well-established dFC methodolog
 
 *   **State-based Methods:** Designed to identify recurring, discrete connectivity patterns or states.
     *   Co-Activation Patterns (CAP) .
-    *   Clustering .
-    *   Continuous Hidden Markov Models (HMM) .
-    *   Discrete Hidden Markov Models (HMM) .
+    *   Clustering (SWC) .
+    *   Continuous Hidden Markov Models (CHMM) .
+    *   Discrete Hidden Markov Models (DHMM) .
     *   Windowless (WL) .
 
 Analysis Pipeline: Script-Based Workflow
 ---------------------------------------
 
-The ``task_dFC`` workflow starts assuming that fMRI data (in BIDS format with ``events.tsv``) has undergone standard preprocessing and denoising (e.g., via fMRIprep and Nipoppy) . The subsequent analysis is executed sequentially through the following scripts:
+The ``task_dFC`` workflow assumes that the fMRI data (in BIDS format, with ``events.tsv`` files) have already undergone standard preprocessing with fMRIPrep. The subsequent analysis is executed sequentially through the following scripts:
 
 1. ``nifti_to_roi_signal.py``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-**Function:** Extracts regional BOLD time series from preprocessed NIfTI data .
+**Function:** Runs denoising and extracts regional BOLD time series from preprocessed NIfTI data.
+
 **Details:** Voxel-wise BOLD signals are parcellated, typically using an atlas such as the Schaefer 100-region atlas, yielding regional time series that serve as the input for dFC assessment.
 
 1. ``FCS_estimate.py``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 **Function:** Estimates Functional Connectivity States (FCS) .
-**Details:** This script fits the dFC model required by **state-based methodologies** (CAP, HMM, Clustering) that rely on identifying **group-level recurring patterns** . The number of brain states for these methods is typically set to five .
+
+**Details:** This script fits the dFC models required by **state-based methodologies** (CAP, HMM, Clustering), which rely on identifying **group-level recurring patterns**.
 
 1. ``dFC_assessment.py``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-**Function:** Computes time-resolved dFC patterns .
+**Function:** Computes individual-level dFC patterns.
+
 **Details:** The script applies the seven implemented dFC methodologies (SW, TF, CAP, etc.) to the BOLD signals of each run and subject to obtain the corresponding high-dimensional dFC patterns .
 
 1. ``ML.py``
@@ -55,29 +58,22 @@ The ``task_dFC`` workflow starts assuming that fMRI data (in BIDS format with ``
 *   The HRF-convolved signal is binarized using a **Gaussian Mixture Model (GMM)** to assign time points as "rest" or "task-present" . This process critically identifies and removes ambiguous **"gray zone" time points** corresponding to transitions, improving classifier performance .
 
 **B. Feature Extraction and Reduction**
-*   **State-free Methods (SW, TF):** DFC matrices are vectorized (e.g., 4950 connections) . **Laplacian Eigenmaps (LE)** dimensionality reduction is applied to make the high-dimensional discriminative information accessible to classifiers, as raw dFC features often yield near-chance accuracy otherwise .
-*   **State-based Methods (CAP, HMM, etc.):** Features are derived from state probabilities, distances from states, or state weights . These resulting compositional features (shape (time, 5)) are transformed using an **isometric log-ratio (ILR) transformation** to yield 4 features per time point .
+*   **State-free Methods (SW, TF):** dFC matrices are vectorized (e.g., 4950 connections for the Schaefer 100-region atlas). **Laplacian Eigenmaps (LE)** dimensionality reduction is applied to make the high-dimensional discriminative information accessible to classifiers.
+*   **State-based Methods (CAP, HMM, etc.):** Features are derived from state probabilities, distances from states, or state weights. The resulting compositional features (shape ``(time, states)``) are transformed using an **isometric log-ratio (ILR) transformation**.
 
 **C. Prediction and Evaluation**
 *   A **Support Vector Machine (SVM) with an RBF kernel** is trained to predict the cognitive state (rest vs. task) at the single-TR level .
 *   **Balanced Accuracy** is used as the primary metric, ensuring chance performance is 50% .
-*   **Cognitive State Separability (CSS)** is quantified using the **Silhouette Index (SI)** to evaluate whether task and rest samples are intrinsically distinguishable in the feature space without supervision .
+*   **Cognitive State Separability** is quantified using the **Silhouette Index (SI)** to evaluate whether task and rest samples are intrinsically distinguishable in the feature space without supervision.
 
-5. ``generate_report.py``
+5. ``generate_report.py``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-**Function:** Summarizes classification efficacy and separability results .
+**Function:** Summarizes classification efficacy and separability results for individual datasets and paradigms.
+
 **Details:** Generates figures, tables, and reports (e.g., heatmaps and boxplots) documenting Balanced Accuracy and SI scores across methods and paradigms.
 
 6. ``multi_dataset_analysis``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 **Function:** Contains scripts for aggregating and comparing results across multiple datasets and paradigms.
-**Details:** Facilitates **large-scale benchmarking** by calculating aggregate performance statistics (e.g., median accuracy) across the 17 datasets and 29 task paradigms analyzed.
-
-Key Findings from Benchmarking
-------------------------------
-
-Based on the analysis implemented via the ``task_dFC`` pipeline across real datasets:
 
-*   **Time-Frequency** is often the most reliable method, consistently achieving high mean and median accuracies across paradigms, while **Sliding Window** exhibits the most variability but can achieve near-perfect accuracy (e.g., for the localiser task) .
-*   The use of **Laplacian Eigenmaps (LE)** is essential for the high predictive accuracy of state-free methods, as PCA-transformed features often yield chance-level accuracies.
-*   Cognitive states in real data generally show **low intrinsic separability (low SI)** . This underscores that **supervised classifiers (SVM)** are necessary to leverage the subtle high-dimensional dFC structure and achieve accurate prediction .
+**Details:** Facilitates **large-scale benchmarking** by calculating aggregate performance statistics (e.g., accuracy distribution) across the 17 datasets and 29 task paradigms analyzed.

From ea9047ec24549e922b760a92502c13a1f048c7a1 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 14 Nov 2025 12:31:07 -0500
Subject: [PATCH 279/401] minor

---
 task_dFC/README.rst | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/task_dFC/README.rst b/task_dFC/README.rst
index 3562b76..cba50ef 100644
--- a/task_dFC/README.rst
+++ b/task_dFC/README.rst
@@ -15,16 +15,18 @@ Methods Implemented
 
 The module supports a diverse selection of seven well-established dFC methodologies implemented within the PydFC toolbox :
 
-*   **State-free Methods:** Designed to capture continuous fluctuations in connectivity.
-    *   Sliding Window (SW) .
-    *   Time-Frequency (TF) .
-
-*   **State-based Methods:** Designed to identify recurring, discrete connectivity patterns or states.
-    *   Co-Activation Patterns (CAP) .
-    *   Clustering (SWC) .
-    *   Continuous Hidden Markov Models (CHMM) .
-    *   Discrete Hidden Markov Models (DHMM) .
-    *   Windowless (WL) .
+*   **State-free Methods:** Designed to capture continuous fluctuations in connectivity .
+
+    *   Sliding Window (SW) [2].
+    *   Time-Frequency (TF) [2].
+
+*   **State-based Methods:** Designed to identify recurring, discrete connectivity patterns or states .
+
+    *   Co-Activation Patterns (CAP) [2].
+    *   Clustering (SWC) [2].
+    *   Continuous Hidden Markov Models (CHMM) [2].
+    *   Discrete Hidden Markov Models (DHMM) [2].
+    *   Windowless (WL) [2].
 
 Analysis Pipeline: Script-Based Workflow
 ---------------------------------------

From e609759ebf2ff2efca0dee4b1d9b2223afb96173 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 14 Nov 2025 12:31:49 -0500
Subject: [PATCH 280/401] minor

---
 task_dFC/README.rst | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/task_dFC/README.rst b/task_dFC/README.rst
index cba50ef..9b5a338 100644
--- a/task_dFC/README.rst
+++ b/task_dFC/README.rst
@@ -17,17 +17,16 @@ The module supports a diverse selection of seven well-established dFC methodolog
 
 *   **State-free Methods:** Designed to capture continuous fluctuations in connectivity .
 
-    *   Sliding Window (SW) [2].
-    *   Time-Frequency (TF) [2].
+    *   Sliding Window (SW).
+    *   Time-Frequency (TF).
 
 *   **State-based Methods:** Designed to identify recurring, discrete connectivity patterns or states .
 
-    *   Co-Activation Patterns (CAP) [2].
-    *   Clustering (SWC) [2].
-    *   Continuous Hidden Markov Models (CHMM) [2].
-    *   Discrete Hidden Markov Models (DHMM) [2].
-    *   Windowless (WL) [2].
-
+    *   Co-Activation Patterns (CAP).
+    *   Clustering (SWC).
+    *   Continuous Hidden Markov Models (CHMM).
+    *   Discrete Hidden Markov Models (DHMM).
+    *   Windowless (WL).
 Analysis Pipeline: Script-Based Workflow
 ---------------------------------------
 

From 246789457ef91382e6f7626a3b3ee55a291ad94c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Nov 2025 18:23:04 -0500
Subject: [PATCH 281/401] improve sample matrix visual

---
 task_dFC/README.rst                           |   2 +-
 .../helper_functions.py                       | 223 ++++++++++++------
 2 files changed, 146 insertions(+), 79 deletions(-)

diff --git a/task_dFC/README.rst b/task_dFC/README.rst
index 9b5a338..67ab300 100644
--- a/task_dFC/README.rst
+++ b/task_dFC/README.rst
@@ -77,4 +77,4 @@ The ``task_dFC`` workflow starts assuming that fMRI data (in BIDS format with ``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 **Function:** Contains scripts for aggregating and comparing results across multiple datasets and paradigms.
 
-**Details:** Facilitates **large-scale benchmarking** by calculating aggregate performance statistics (e.g., accuracy distribution) across the 17 datasets and 29 task paradigms analyzed.
+**Details:** Facilitates **large-scale benchmarking** by calculating aggregate performance statistics (e.g., accuracy distribution) across the analyzed datasets and task paradigms.
diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index fcaa823..fd754d6 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -923,14 +923,13 @@ def figure_dfc_matrices_window_png(
 
 
 def nice_step(n, max_ticks=10):
-    """Return a 'nice' step (1-2-5x10^k) to keep ≤ max_ticks across [1..n]."""
-    if n <= 1:
+    if n <= max_ticks:
         return 1
-    raw = max(1.0, n / max(2, (max_ticks - 1)))
-    exp = np.floor(np.log10(raw))
-    frac = raw / (10**exp)
-    base = 1 if frac <= 1 else 2 if frac <= 2 else 5 if frac <= 5 else 10
-    return int(base * (10**exp))
+    raw = n / max_ticks
+    base = 10 ** int(np.floor(np.log10(raw)))
+    for m in [1, 2, 5, 10]:
+        if raw <= m * base:
+            return int(m * base)
 
 
 def plot_samples_features(
@@ -939,28 +938,33 @@ def plot_samples_features(
     *,
     sample_order="original",  # "original" | "label" | "label+cluster"
     feature_order="original",  # "original" | "tstat"
-    col_order_from_train=None,  # optional np.ndarray (feature indices) to reuse on test
+    col_order_from_train=None,  # np.ndarray feature indices (use train order on test)
     ZSCORE=True,
     V_RANGE=None,
     cmap="coolwarm",
+    # ---- new options ----
+    show_tbar_when_cluster=True,  # show t-stat strip only when sample_order == "label+cluster"
+    tbar_mode="abs_t",  # "abs_t" (recommended), "t", or "none"
+    tbar_cmap="magma",  # sequential for |t|
+    show_class_means=True,  # show rest/task mean strips when clustered
+    means_cmap=None,  # None -> use main cmap
     title=None,
     save_path=None,
     show=True,
 ):
     """
-    X: (n_samples, n_features) matrix (features in columns)
-    y: (n_samples,) binary (0=rest, 1=task)
+    Samples on horizontal axis; features on vertical axis.
 
-    Samples are shown along the horizontal axis (time-like), features along the vertical axis.
-    If feature_order == "tstat", a slim vertical t-stat bar is shown on the LEFT,
-    aligned 1:1 with feature rows (no top t-bar).
+    When sample_order="label+cluster":
+      - Optional left sidebars:
+        * |t| strip for per-feature t-stat magnitude (sequential colormap + its own colorbar)
+        * Rest and Task mean strips (diverging colormap centered at 0; same scale as main heatmap)
     """
     # ---------- prep ----------
     X = np.asarray(X, float)
     y = np.asarray(y)
     n_samples, n_features = X.shape
 
-    # z-score per feature
     Xz = X.copy()
     if ZSCORE:
         mu = Xz.mean(axis=0, keepdims=True)
@@ -968,6 +972,7 @@ def plot_samples_features(
         Xz = (Xz - mu) / sd
 
     # ---------- feature order ----------
+    t_ord = None
     if feature_order == "tstat":
         if col_order_from_train is not None:
             col_order = np.asarray(col_order_from_train, int)
@@ -975,11 +980,10 @@ def plot_samples_features(
             t_ord = t[col_order]
         else:
             t, _ = ttest_ind(Xz[y == 1], Xz[y == 0], axis=0, equal_var=False)
-            col_order = np.argsort(-np.abs(t))  # strongest contrast first
+            col_order = np.argsort(-np.abs(t))
             t_ord = t[col_order]
     else:
         col_order = np.arange(n_features)
-        t_ord = None  # no t-stat bar
 
     # ---------- sample order ----------
     if sample_order == "original":
@@ -1011,29 +1015,22 @@ def order_rows(A):
             "sample_order must be one of {'original','label','label+cluster'}"
         )
 
-    # ---------- figure & layout (no top t-bar) ----------
-    # W = max(10, min(24, n_samples / 30))
-    w_min = 12
-    w_max = 24
-    width_per_100 = 0.5  # additional width per 100 samples
+    # ---------- figure & layout ----------
+    w_min, w_max = 12, 24
+    width_per_100 = 0.5
     W = float(np.clip(w_min + (n_samples / 100.0) * width_per_100, w_min, w_max))
     H = max(6, min(16, n_features / 30))
     fig = plt.figure(figsize=(W, H))
 
-    gs = fig.add_gridspec(
-        nrows=2,
-        ncols=1,
-        height_ratios=[1.0, 0.06],  # main heatmap + class strip
-        hspace=0.08,
-    )
+    gs = fig.add_gridspec(nrows=2, ncols=1, height_ratios=[1.0, 0.06], hspace=0.08)
     ax_main = fig.add_subplot(gs[0, 0])
     ax_lab = fig.add_subplot(gs[1, 0])
 
     # --- VRANGE ---
     if V_RANGE is None:
-        Xflat = np.asarray(Xz, float).ravel()
-        lo, hi = np.nanpercentile(Xflat, [5, 95])  # robust to outliers; tweak if needed
-        V_RANGE = max(abs(lo), abs(hi))  # symmetric around 0 (for diverging cmap)
+        flat = np.asarray(Xz, float).ravel()
+        lo, hi = np.nanpercentile(flat, [5, 95])
+        V_RANGE = max(abs(lo), abs(hi))
 
     # ---------- main heatmap ----------
     img = Xz[row_order, :][:, col_order].T  # (features, samples)
@@ -1043,23 +1040,14 @@ def order_rows(A):
     n_features = img.shape[0]
     last_idx = n_features - 1
 
+    # y-ticks: nice round, 1-based labels, no crowding
     if n_features < 10:
-        # every feature: labels 1..n, positions 0..n-1
         labels_1based = np.arange(1, n_features + 1, dtype=int)
     else:
         step = nice_step(n_features, max_ticks=10)
-        # use round multiples of the step
-        labels_1based = list(np.arange(step, n_features + 1, step, dtype=int))
-        # de-dup & sort (in case step == 1)
-        labels_1based = np.unique(labels_1based)
-
-    # convert 1-based labels to 0-based tick positions
+        labels_1based = np.unique(np.arange(step, n_features + 1, step, dtype=int))
     ticks_pos = labels_1based - 1
-
-    # lock y-limits so the last tick isn't clipped
     ax_main.set_ylim(-0.5, last_idx + 0.5)
-
-    # set ticks & labels
     ax_main.set_yticks(ticks_pos)
     ax_main.set_yticklabels([f"{v:d}" for v in labels_1based])
     ax_main.set_ylabel("feature", fontsize=12, fontweight="bold")
@@ -1071,21 +1059,18 @@ def order_rows(A):
         ax_main.axvline(split - 0.5, color="k", lw=1)
 
     # ---------- bottom class strip ----------
-    y_reordered = y[row_order]
+    y_re = y[row_order]
     cmap_lbl = ListedColormap(
         [[0.85, 0.85, 0.85], [0.25, 0.5, 0.9]]
     )  # rest=gray, task=blue
     ax_lab.imshow(
-        y_reordered[None, :], aspect="auto", origin="lower", cmap=cmap_lbl, vmin=0, vmax=1
+        y_re[None, :], aspect="auto", origin="lower", cmap=cmap_lbl, vmin=0, vmax=1
     )
     ax_lab.set_yticks([])
     ax_lab.set_xticks([])
-    # ax_lab.set_title("class", fontsize=11, pad=2)
 
-    # show class labels only when there is label grouping
     if draw_separator:
-        n0 = (y_reordered == 0).sum()
-        n1 = (y_reordered == 1).sum()
+        n0, n1 = (y_re == 0).sum(), (y_re == 1).sum()
         if n0 > 0:
             x0 = (n0 - 1) / 2.0
             ax_lab.annotate(
@@ -1109,43 +1094,125 @@ def order_rows(A):
                 fontweight="bold",
             )
 
-    # --- move the class bar (ax_lab) down a bit ---
-    fig.canvas.draw()  # ensure positions are current
-    lab_box = ax_lab.get_position()  # [x0, y0, width, height] in figure coords
-    down = 0.020  # how much to move down (figure fraction)
-    new_y0 = max(0.01, lab_box.y0 - down)  # keep it inside the figure
-    ax_lab.set_position([lab_box.x0, new_y0, lab_box.width, lab_box.height])
-
-    # (re)grab the updated box for the colorbar placement that comes next
+    # move class bar slightly down (keeps labels clear)
+    fig.canvas.draw()
     lab_box = ax_lab.get_position()
+    ax_lab.set_position(
+        [lab_box.x0, max(0.01, lab_box.y0 - 0.020), lab_box.width, lab_box.height]
+    )
 
-    # ---------- LEFT vertical t-stat bar (only if feature_order=="tstat") ----------
-    if t_ord is not None:
-        fig.canvas.draw()
-        main_box = ax_main.get_position()  # figure coords
-
-        tbar_left_width = 0.010  # ~2% fig width
-        tbar_left_pad = 0.035 / W * 24  # gap from main heatmap, proportional to fig width
-
-        x0 = max(0.01, main_box.x0 - tbar_left_pad - tbar_left_width)
-        y0 = main_box.y0
-        w = tbar_left_width
-        h = main_box.height
-
-        ax_tleft = fig.add_axes([x0, y0, w, h])
-        m = np.nanmax(np.abs(t_ord)) if np.isfinite(t_ord).any() else 1.0
-        ax_tleft.imshow(
-            t_ord[:, None], origin="lower", aspect="auto", cmap=cmap, vmin=-m, vmax=m
-        )
-        ax_tleft.set_xticks([])
-        ax_tleft.set_yticks([])
-        ax_tleft.set_title("t-stat", fontsize=11, pad=2, fontweight="bold")
+    # ---------- LEFT sidebars (only when clustered) ----------
+    if sample_order == "label+cluster":
+        # compute per-feature means on z-scored features (columns=features)
+        mu0 = (Xz[y == 0].mean(axis=0) if (y == 0).any() else np.zeros(n_features))[
+            col_order
+        ]
+        mu1 = (Xz[y == 1].mean(axis=0) if (y == 1).any() else np.zeros(n_features))[
+            col_order
+        ]
 
-    # ---------- colorbar (slightly lower so it doesn't overlap class labels) ----------
+        fig.canvas.draw()
+        main_box = ax_main.get_position()
+
+        # geometry
+        pad = 0.012  # gap between strips
+        wbar = 0.012  # width of each strip
+        x_right = main_box.x0 - 0.018  # anchor to left of main heatmap
+
+        # ---- (A) |t| strip ----
+        if (tbar_mode != "none") and (t_ord is not None) and show_tbar_when_cluster:
+            if tbar_mode == "abs_t":
+                tshow = np.abs(t_ord)
+                ttitle = "|t| (task−rest)"
+                tnorm_min, tnorm_max = 0.0, float(np.nanmax(tshow)) or 1.0
+                t_cmap = tbar_cmap
+            else:
+                tshow = t_ord
+                ttitle = "t (task−rest)"
+                m = float(np.nanmax(np.abs(tshow))) or 1.0
+                tnorm_min, tnorm_max = -m, m
+                t_cmap = "coolwarm"
+
+            ax_t = fig.add_axes(
+                [x_right - (wbar + pad), main_box.y0, wbar, main_box.height]
+            )
+            ax_t.imshow(
+                tshow[:, None],
+                origin="lower",
+                aspect="auto",
+                cmap=t_cmap,
+                vmin=tnorm_min,
+                vmax=tnorm_max,
+            )
+            ax_t.set_xticks([])
+            ax_t.set_yticks([])
+            # vertical title + frame so it can’t be confused with the main colorbar
+            ax_t.set_title(ttitle, fontsize=10, pad=2, fontweight="bold", loc="center")
+            for spine in ax_t.spines.values():
+                spine.set_linewidth(0.8)
+
+            # a tiny vertical colorbar just for |t|
+            cax_t = fig.add_axes(
+                [ax_t.get_position().x0, max(0.01, main_box.y0 - 0.06), wbar, 0.05]
+            )
+            cb_t = plt.colorbar(
+                plt.cm.ScalarMappable(), cax=cax_t, orientation="horizontal"
+            )
+            cb_t.remove()  # replace with proper mappable:
+            sm_t = plt.cm.ScalarMappable(cmap=t_cmap)
+            sm_t.set_array([])
+            sm_t.set_clim(tnorm_min, tnorm_max)
+            cb_t = plt.colorbar(sm_t, cax=cax_t, orientation="horizontal")
+            cb_t.set_label("|t|" if tbar_mode == "abs_t" else "t", fontsize=9)
+            cb_t.ax.tick_params(labelsize=8)
+
+        # ---- (B) class mean strips (rest/task) ----
+        if show_class_means:
+            means_cmap = means_cmap or cmap  # default: same as main heatmap
+            # use same color limits as main img so colors are comparable
+            ax_m0 = fig.add_axes([x_right, main_box.y0, wbar, main_box.height])
+            ax_m0.imshow(
+                mu0[:, None],
+                origin="lower",
+                aspect="auto",
+                cmap=means_cmap,
+                vmin=-V_RANGE,
+                vmax=V_RANGE,
+            )
+            ax_m0.set_xticks([])
+            ax_m0.set_yticks([])
+            ax_m0.set_title("mean\nrest", fontsize=9, pad=0, fontweight="bold")
+            for spine in ax_m0.spines.values():
+                spine.set_linewidth(0.8)
+
+            ax_m1 = fig.add_axes(
+                [x_right + (wbar + pad), main_box.y0, wbar, main_box.height]
+            )
+            ax_m1.imshow(
+                mu1[:, None],
+                origin="lower",
+                aspect="auto",
+                cmap=means_cmap,
+                vmin=-V_RANGE,
+                vmax=V_RANGE,
+            )
+            ax_m1.set_xticks([])
+            ax_m1.set_yticks([])
+            ax_m1.set_title("mean\ntask", fontsize=9, pad=0, fontweight="bold")
+            for spine in ax_m1.spines.values():
+                spine.set_linewidth(0.8)
+
+            # zero line reference (thin) – helps interpret sign
+            # draw in data coords of the strips (y from 0..n_features-1)
+            for axi in (ax_m0, ax_m1):
+                axi.axhline(-0.5, color="k", lw=0.0)  # keep limits stable
+                # (we keep the zero reference conceptual; adding a visible long line can clutter)
+
+    # ---------- main colorbar (moved slightly lower to avoid class labels) ----------
     fig.canvas.draw()
     lab_box = ax_lab.get_position()
     cbar_h = 0.02
-    cbar_y = max(0.01, lab_box.y0 - 0.085)  # you liked 0.085
+    cbar_y = max(0.01, lab_box.y0 - 0.085)  # your preferred offset
     cax = fig.add_axes([0.12, cbar_y, 0.30, cbar_h])
     cb = plt.colorbar(im, cax=cax, orientation="horizontal")
     cb.set_label("z-scored feature value", fontsize=11, fontweight="bold")

From 6c9fb9cb607894140a07fc3905846fde69699d6c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Nov 2025 18:25:07 -0500
Subject: [PATCH 282/401] minor

---
 task_dFC/multi_dataset_analysis/sample_matrix_visualization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 2f5797f..28bd57d 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -67,7 +67,7 @@
     if not os.path.exists(output_root):
         os.makedirs(output_root)
 
-    for dataset in DATASETS:
+    for dataset in ["ds004848"]:
         dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
         roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
         dFC_root = f"{main_root}/{dataset}/derivatives/dFC_assessed"

From 42d016448efab4781ca4e0705237c71aea35b7c5 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Nov 2025 18:35:49 -0500
Subject: [PATCH 283/401] fix minor

---
 .../helper_functions.py                       | 84 +++++++++++++------
 1 file changed, 59 insertions(+), 25 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index fd754d6..bbc5f08 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -1119,14 +1119,33 @@ def order_rows(A):
         wbar = 0.012  # width of each strip
         x_right = main_box.x0 - 0.018  # anchor to left of main heatmap
 
+        # ---------- LEFT sidebars (only when clustered) ----------
+    if sample_order == "label+cluster":
+        # compute per-feature means on z-scored features (columns=features)
+        mu0 = (Xz[y == 0].mean(axis=0) if (y == 0).any() else np.zeros(n_features))[
+            col_order
+        ]
+        mu1 = (Xz[y == 1].mean(axis=0) if (y == 1).any() else np.zeros(n_features))[
+            col_order
+        ]
+
+        fig.canvas.draw()
+        main_box = ax_main.get_position()
+
+        # geometry
+        pad = 0.012  # gap between strips
+        wbar = 0.012  # width of each strip
+        x_right = main_box.x0 - 0.018  # anchor to left of main heatmap
+
         # ---- (A) |t| strip ----
         if (tbar_mode != "none") and (t_ord is not None) and show_tbar_when_cluster:
             if tbar_mode == "abs_t":
                 tshow = np.abs(t_ord)
                 ttitle = "|t| (task−rest)"
                 tnorm_min, tnorm_max = 0.0, float(np.nanmax(tshow)) or 1.0
-                t_cmap = tbar_cmap
+                t_cmap = tbar_cmap  # e.g., "magma"
             else:
+                # fallback to signed t if you ever toggle it
                 tshow = t_ord
                 ttitle = "t (task−rest)"
                 m = float(np.nanmax(np.abs(tshow))) or 1.0
@@ -1146,25 +1165,22 @@ def order_rows(A):
             )
             ax_t.set_xticks([])
             ax_t.set_yticks([])
-            # vertical title + frame so it can’t be confused with the main colorbar
-            ax_t.set_title(ttitle, fontsize=10, pad=2, fontweight="bold", loc="center")
-            for spine in ax_t.spines.values():
-                spine.set_linewidth(0.8)
 
-            # a tiny vertical colorbar just for |t|
-            cax_t = fig.add_axes(
-                [ax_t.get_position().x0, max(0.01, main_box.y0 - 0.06), wbar, 0.05]
-            )
-            cb_t = plt.colorbar(
-                plt.cm.ScalarMappable(), cax=cax_t, orientation="horizontal"
+            # make it unmistakable: vertical label + a thin frame
+            ax_t.text(
+                0.5,
+                1.01,
+                ttitle,
+                transform=ax_t.transAxes,
+                ha="center",
+                va="bottom",
+                rotation=90,
+                fontsize=9,
+                fontweight="bold",
             )
-            cb_t.remove()  # replace with proper mappable:
-            sm_t = plt.cm.ScalarMappable(cmap=t_cmap)
-            sm_t.set_array([])
-            sm_t.set_clim(tnorm_min, tnorm_max)
-            cb_t = plt.colorbar(sm_t, cax=cax_t, orientation="horizontal")
-            cb_t.set_label("|t|" if tbar_mode == "abs_t" else "t", fontsize=9)
-            cb_t.ax.tick_params(labelsize=8)
+            for spine in ax_t.spines.values():
+                spine.set_linewidth(0.8)
+                spine.set_alpha(0.9)
 
         # ---- (B) class mean strips (rest/task) ----
         if show_class_means:
@@ -1181,7 +1197,16 @@ def order_rows(A):
             )
             ax_m0.set_xticks([])
             ax_m0.set_yticks([])
-            ax_m0.set_title("mean\nrest", fontsize=9, pad=0, fontweight="bold")
+            ax_m0.text(
+                0.5,
+                1.01,
+                "mean\nrest",
+                transform=ax_m0.transAxes,
+                ha="center",
+                va="bottom",
+                fontsize=9,
+                fontweight="bold",
+            )
             for spine in ax_m0.spines.values():
                 spine.set_linewidth(0.8)
 
@@ -1198,15 +1223,24 @@ def order_rows(A):
             )
             ax_m1.set_xticks([])
             ax_m1.set_yticks([])
-            ax_m1.set_title("mean\ntask", fontsize=9, pad=0, fontweight="bold")
+            ax_m1.text(
+                0.5,
+                1.01,
+                "mean\ntask",
+                transform=ax_m1.transAxes,
+                ha="center",
+                va="bottom",
+                fontsize=9,
+                fontweight="bold",
+            )
             for spine in ax_m1.spines.values():
                 spine.set_linewidth(0.8)
 
-            # zero line reference (thin) – helps interpret sign
-            # draw in data coords of the strips (y from 0..n_features-1)
-            for axi in (ax_m0, ax_m1):
-                axi.axhline(-0.5, color="k", lw=0.0)  # keep limits stable
-                # (we keep the zero reference conceptual; adding a visible long line can clutter)
+            # # zero line reference (thin) – helps interpret sign
+            # # draw in data coords of the strips (y from 0..n_features-1)
+            # for axi in (ax_m0, ax_m1):
+            #     axi.axhline(-0.5, color="k", lw=0.0)  # keep limits stable
+            #     # (we keep the zero reference conceptual; adding a visible long line can clutter)
 
     # ---------- main colorbar (moved slightly lower to avoid class labels) ----------
     fig.canvas.draw()

From 8519d715923d8e95f78e7567c8b4d911c2936eb9 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Nov 2025 18:50:11 -0500
Subject: [PATCH 284/401] minor

---
 .../helper_functions.py                       | 249 ++++++------------
 .../sample_matrix_visualization.py            |   4 +-
 2 files changed, 76 insertions(+), 177 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index bbc5f08..fcaa823 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -923,13 +923,14 @@ def figure_dfc_matrices_window_png(
 
 
 def nice_step(n, max_ticks=10):
-    if n <= max_ticks:
+    """Return a 'nice' step (1-2-5x10^k) to keep ≤ max_ticks across [1..n]."""
+    if n <= 1:
         return 1
-    raw = n / max_ticks
-    base = 10 ** int(np.floor(np.log10(raw)))
-    for m in [1, 2, 5, 10]:
-        if raw <= m * base:
-            return int(m * base)
+    raw = max(1.0, n / max(2, (max_ticks - 1)))
+    exp = np.floor(np.log10(raw))
+    frac = raw / (10**exp)
+    base = 1 if frac <= 1 else 2 if frac <= 2 else 5 if frac <= 5 else 10
+    return int(base * (10**exp))
 
 
 def plot_samples_features(
@@ -938,33 +939,28 @@ def plot_samples_features(
     *,
     sample_order="original",  # "original" | "label" | "label+cluster"
     feature_order="original",  # "original" | "tstat"
-    col_order_from_train=None,  # np.ndarray feature indices (use train order on test)
+    col_order_from_train=None,  # optional np.ndarray (feature indices) to reuse on test
     ZSCORE=True,
     V_RANGE=None,
     cmap="coolwarm",
-    # ---- new options ----
-    show_tbar_when_cluster=True,  # show t-stat strip only when sample_order == "label+cluster"
-    tbar_mode="abs_t",  # "abs_t" (recommended), "t", or "none"
-    tbar_cmap="magma",  # sequential for |t|
-    show_class_means=True,  # show rest/task mean strips when clustered
-    means_cmap=None,  # None -> use main cmap
     title=None,
     save_path=None,
     show=True,
 ):
     """
-    Samples on horizontal axis; features on vertical axis.
+    X: (n_samples, n_features) matrix (features in columns)
+    y: (n_samples,) binary (0=rest, 1=task)
 
-    When sample_order="label+cluster":
-      - Optional left sidebars:
-        * |t| strip for per-feature t-stat magnitude (sequential colormap + its own colorbar)
-        * Rest and Task mean strips (diverging colormap centered at 0; same scale as main heatmap)
+    Samples are shown along the horizontal axis (time-like), features along the vertical axis.
+    If feature_order == "tstat", a slim vertical t-stat bar is shown on the LEFT,
+    aligned 1:1 with feature rows (no top t-bar).
     """
     # ---------- prep ----------
     X = np.asarray(X, float)
     y = np.asarray(y)
     n_samples, n_features = X.shape
 
+    # z-score per feature
     Xz = X.copy()
     if ZSCORE:
         mu = Xz.mean(axis=0, keepdims=True)
@@ -972,7 +968,6 @@ def plot_samples_features(
         Xz = (Xz - mu) / sd
 
     # ---------- feature order ----------
-    t_ord = None
     if feature_order == "tstat":
         if col_order_from_train is not None:
             col_order = np.asarray(col_order_from_train, int)
@@ -980,10 +975,11 @@ def plot_samples_features(
             t_ord = t[col_order]
         else:
             t, _ = ttest_ind(Xz[y == 1], Xz[y == 0], axis=0, equal_var=False)
-            col_order = np.argsort(-np.abs(t))
+            col_order = np.argsort(-np.abs(t))  # strongest contrast first
             t_ord = t[col_order]
     else:
         col_order = np.arange(n_features)
+        t_ord = None  # no t-stat bar
 
     # ---------- sample order ----------
     if sample_order == "original":
@@ -1015,22 +1011,29 @@ def order_rows(A):
             "sample_order must be one of {'original','label','label+cluster'}"
         )
 
-    # ---------- figure & layout ----------
-    w_min, w_max = 12, 24
-    width_per_100 = 0.5
+    # ---------- figure & layout (no top t-bar) ----------
+    # W = max(10, min(24, n_samples / 30))
+    w_min = 12
+    w_max = 24
+    width_per_100 = 0.5  # additional width per 100 samples
     W = float(np.clip(w_min + (n_samples / 100.0) * width_per_100, w_min, w_max))
     H = max(6, min(16, n_features / 30))
     fig = plt.figure(figsize=(W, H))
 
-    gs = fig.add_gridspec(nrows=2, ncols=1, height_ratios=[1.0, 0.06], hspace=0.08)
+    gs = fig.add_gridspec(
+        nrows=2,
+        ncols=1,
+        height_ratios=[1.0, 0.06],  # main heatmap + class strip
+        hspace=0.08,
+    )
     ax_main = fig.add_subplot(gs[0, 0])
     ax_lab = fig.add_subplot(gs[1, 0])
 
     # --- VRANGE ---
     if V_RANGE is None:
-        flat = np.asarray(Xz, float).ravel()
-        lo, hi = np.nanpercentile(flat, [5, 95])
-        V_RANGE = max(abs(lo), abs(hi))
+        Xflat = np.asarray(Xz, float).ravel()
+        lo, hi = np.nanpercentile(Xflat, [5, 95])  # robust to outliers; tweak if needed
+        V_RANGE = max(abs(lo), abs(hi))  # symmetric around 0 (for diverging cmap)
 
     # ---------- main heatmap ----------
     img = Xz[row_order, :][:, col_order].T  # (features, samples)
@@ -1040,14 +1043,23 @@ def order_rows(A):
     n_features = img.shape[0]
     last_idx = n_features - 1
 
-    # y-ticks: nice round, 1-based labels, no crowding
     if n_features < 10:
+        # every feature: labels 1..n, positions 0..n-1
         labels_1based = np.arange(1, n_features + 1, dtype=int)
     else:
         step = nice_step(n_features, max_ticks=10)
-        labels_1based = np.unique(np.arange(step, n_features + 1, step, dtype=int))
+        # use round multiples of the step
+        labels_1based = list(np.arange(step, n_features + 1, step, dtype=int))
+        # de-dup & sort (in case step == 1)
+        labels_1based = np.unique(labels_1based)
+
+    # convert 1-based labels to 0-based tick positions
     ticks_pos = labels_1based - 1
+
+    # lock y-limits so the last tick isn't clipped
     ax_main.set_ylim(-0.5, last_idx + 0.5)
+
+    # set ticks & labels
     ax_main.set_yticks(ticks_pos)
     ax_main.set_yticklabels([f"{v:d}" for v in labels_1based])
     ax_main.set_ylabel("feature", fontsize=12, fontweight="bold")
@@ -1059,18 +1071,21 @@ def order_rows(A):
         ax_main.axvline(split - 0.5, color="k", lw=1)
 
     # ---------- bottom class strip ----------
-    y_re = y[row_order]
+    y_reordered = y[row_order]
     cmap_lbl = ListedColormap(
         [[0.85, 0.85, 0.85], [0.25, 0.5, 0.9]]
     )  # rest=gray, task=blue
     ax_lab.imshow(
-        y_re[None, :], aspect="auto", origin="lower", cmap=cmap_lbl, vmin=0, vmax=1
+        y_reordered[None, :], aspect="auto", origin="lower", cmap=cmap_lbl, vmin=0, vmax=1
     )
     ax_lab.set_yticks([])
     ax_lab.set_xticks([])
+    # ax_lab.set_title("class", fontsize=11, pad=2)
 
+    # show class labels only when there is label grouping
     if draw_separator:
-        n0, n1 = (y_re == 0).sum(), (y_re == 1).sum()
+        n0 = (y_reordered == 0).sum()
+        n1 = (y_reordered == 1).sum()
         if n0 > 0:
             x0 = (n0 - 1) / 2.0
             ax_lab.annotate(
@@ -1094,159 +1109,43 @@ def order_rows(A):
                 fontweight="bold",
             )
 
-    # move class bar slightly down (keeps labels clear)
-    fig.canvas.draw()
-    lab_box = ax_lab.get_position()
-    ax_lab.set_position(
-        [lab_box.x0, max(0.01, lab_box.y0 - 0.020), lab_box.width, lab_box.height]
-    )
+    # --- move the class bar (ax_lab) down a bit ---
+    fig.canvas.draw()  # ensure positions are current
+    lab_box = ax_lab.get_position()  # [x0, y0, width, height] in figure coords
+    down = 0.020  # how much to move down (figure fraction)
+    new_y0 = max(0.01, lab_box.y0 - down)  # keep it inside the figure
+    ax_lab.set_position([lab_box.x0, new_y0, lab_box.width, lab_box.height])
 
-    # ---------- LEFT sidebars (only when clustered) ----------
-    if sample_order == "label+cluster":
-        # compute per-feature means on z-scored features (columns=features)
-        mu0 = (Xz[y == 0].mean(axis=0) if (y == 0).any() else np.zeros(n_features))[
-            col_order
-        ]
-        mu1 = (Xz[y == 1].mean(axis=0) if (y == 1).any() else np.zeros(n_features))[
-            col_order
-        ]
+    # (re)grab the updated box for the colorbar placement that comes next
+    lab_box = ax_lab.get_position()
 
+    # ---------- LEFT vertical t-stat bar (only if feature_order=="tstat") ----------
+    if t_ord is not None:
         fig.canvas.draw()
-        main_box = ax_main.get_position()
-
-        # geometry
-        pad = 0.012  # gap between strips
-        wbar = 0.012  # width of each strip
-        x_right = main_box.x0 - 0.018  # anchor to left of main heatmap
-
-        # ---------- LEFT sidebars (only when clustered) ----------
-    if sample_order == "label+cluster":
-        # compute per-feature means on z-scored features (columns=features)
-        mu0 = (Xz[y == 0].mean(axis=0) if (y == 0).any() else np.zeros(n_features))[
-            col_order
-        ]
-        mu1 = (Xz[y == 1].mean(axis=0) if (y == 1).any() else np.zeros(n_features))[
-            col_order
-        ]
+        main_box = ax_main.get_position()  # figure coords
 
-        fig.canvas.draw()
-        main_box = ax_main.get_position()
-
-        # geometry
-        pad = 0.012  # gap between strips
-        wbar = 0.012  # width of each strip
-        x_right = main_box.x0 - 0.018  # anchor to left of main heatmap
-
-        # ---- (A) |t| strip ----
-        if (tbar_mode != "none") and (t_ord is not None) and show_tbar_when_cluster:
-            if tbar_mode == "abs_t":
-                tshow = np.abs(t_ord)
-                ttitle = "|t| (task−rest)"
-                tnorm_min, tnorm_max = 0.0, float(np.nanmax(tshow)) or 1.0
-                t_cmap = tbar_cmap  # e.g., "magma"
-            else:
-                # fallback to signed t if you ever toggle it
-                tshow = t_ord
-                ttitle = "t (task−rest)"
-                m = float(np.nanmax(np.abs(tshow))) or 1.0
-                tnorm_min, tnorm_max = -m, m
-                t_cmap = "coolwarm"
-
-            ax_t = fig.add_axes(
-                [x_right - (wbar + pad), main_box.y0, wbar, main_box.height]
-            )
-            ax_t.imshow(
-                tshow[:, None],
-                origin="lower",
-                aspect="auto",
-                cmap=t_cmap,
-                vmin=tnorm_min,
-                vmax=tnorm_max,
-            )
-            ax_t.set_xticks([])
-            ax_t.set_yticks([])
-
-            # make it unmistakable: vertical label + a thin frame
-            ax_t.text(
-                0.5,
-                1.01,
-                ttitle,
-                transform=ax_t.transAxes,
-                ha="center",
-                va="bottom",
-                rotation=90,
-                fontsize=9,
-                fontweight="bold",
-            )
-            for spine in ax_t.spines.values():
-                spine.set_linewidth(0.8)
-                spine.set_alpha(0.9)
-
-        # ---- (B) class mean strips (rest/task) ----
-        if show_class_means:
-            means_cmap = means_cmap or cmap  # default: same as main heatmap
-            # use same color limits as main img so colors are comparable
-            ax_m0 = fig.add_axes([x_right, main_box.y0, wbar, main_box.height])
-            ax_m0.imshow(
-                mu0[:, None],
-                origin="lower",
-                aspect="auto",
-                cmap=means_cmap,
-                vmin=-V_RANGE,
-                vmax=V_RANGE,
-            )
-            ax_m0.set_xticks([])
-            ax_m0.set_yticks([])
-            ax_m0.text(
-                0.5,
-                1.01,
-                "mean\nrest",
-                transform=ax_m0.transAxes,
-                ha="center",
-                va="bottom",
-                fontsize=9,
-                fontweight="bold",
-            )
-            for spine in ax_m0.spines.values():
-                spine.set_linewidth(0.8)
+        tbar_left_width = 0.010  # ~2% fig width
+        tbar_left_pad = 0.035 / W * 24  # gap from main heatmap, proportional to fig width
 
-            ax_m1 = fig.add_axes(
-                [x_right + (wbar + pad), main_box.y0, wbar, main_box.height]
-            )
-            ax_m1.imshow(
-                mu1[:, None],
-                origin="lower",
-                aspect="auto",
-                cmap=means_cmap,
-                vmin=-V_RANGE,
-                vmax=V_RANGE,
-            )
-            ax_m1.set_xticks([])
-            ax_m1.set_yticks([])
-            ax_m1.text(
-                0.5,
-                1.01,
-                "mean\ntask",
-                transform=ax_m1.transAxes,
-                ha="center",
-                va="bottom",
-                fontsize=9,
-                fontweight="bold",
-            )
-            for spine in ax_m1.spines.values():
-                spine.set_linewidth(0.8)
+        x0 = max(0.01, main_box.x0 - tbar_left_pad - tbar_left_width)
+        y0 = main_box.y0
+        w = tbar_left_width
+        h = main_box.height
 
-            # # zero line reference (thin) – helps interpret sign
-            # # draw in data coords of the strips (y from 0..n_features-1)
-            # for axi in (ax_m0, ax_m1):
-            #     axi.axhline(-0.5, color="k", lw=0.0)  # keep limits stable
-            #     # (we keep the zero reference conceptual; adding a visible long line can clutter)
+        ax_tleft = fig.add_axes([x0, y0, w, h])
+        m = np.nanmax(np.abs(t_ord)) if np.isfinite(t_ord).any() else 1.0
+        ax_tleft.imshow(
+            t_ord[:, None], origin="lower", aspect="auto", cmap=cmap, vmin=-m, vmax=m
+        )
+        ax_tleft.set_xticks([])
+        ax_tleft.set_yticks([])
+        ax_tleft.set_title("t-stat", fontsize=11, pad=2, fontweight="bold")
 
-    # ---------- main colorbar (moved slightly lower to avoid class labels) ----------
+    # ---------- colorbar (slightly lower so it doesn't overlap class labels) ----------
     fig.canvas.draw()
     lab_box = ax_lab.get_position()
     cbar_h = 0.02
-    cbar_y = max(0.01, lab_box.y0 - 0.085)  # your preferred offset
+    cbar_y = max(0.01, lab_box.y0 - 0.085)  # you liked 0.085
     cax = fig.add_axes([0.12, cbar_y, 0.30, cbar_h])
     cb = plt.colorbar(im, cax=cax, orientation="horizontal")
     cb.set_label("z-scored feature value", fontsize=11, fontweight="bold")
diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 28bd57d..b82b58d 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -271,7 +271,7 @@
                             X,
                             y,
                             sample_order="label+cluster",
-                            feature_order="tstat",
+                            feature_order="original",
                             save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
                             show=False,
                         )
@@ -281,7 +281,7 @@
                             X,
                             y,
                             sample_order="label+cluster",  # clustering is per-split; that’s fine
-                            feature_order="tstat",  # we still show the t-bar for reference
+                            feature_order="original",  # we still show the t-bar for reference
                             col_order_from_train=orders["col_order"],
                             save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
                             show=False,

From 7169f8b76707af1fa72ddcccd78033ecfb189596 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Nov 2025 19:08:20 -0500
Subject: [PATCH 285/401] minor

---
 .../helper_functions.py                       | 48 +++++++++----------
 .../sample_matrix_visualization.py            |  4 +-
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index fcaa823..2ad81b6 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -972,14 +972,14 @@ def plot_samples_features(
         if col_order_from_train is not None:
             col_order = np.asarray(col_order_from_train, int)
             t, _ = ttest_ind(Xz[y == 1], Xz[y == 0], axis=0, equal_var=False)
-            t_ord = t[col_order]
+            # t_ord = t[col_order]
         else:
             t, _ = ttest_ind(Xz[y == 1], Xz[y == 0], axis=0, equal_var=False)
             col_order = np.argsort(-np.abs(t))  # strongest contrast first
-            t_ord = t[col_order]
+            # t_ord = t[col_order]
     else:
         col_order = np.arange(n_features)
-        t_ord = None  # no t-stat bar
+        # t_ord = None  # no t-stat bar
 
     # ---------- sample order ----------
     if sample_order == "original":
@@ -1119,27 +1119,27 @@ def order_rows(A):
     # (re)grab the updated box for the colorbar placement that comes next
     lab_box = ax_lab.get_position()
 
-    # ---------- LEFT vertical t-stat bar (only if feature_order=="tstat") ----------
-    if t_ord is not None:
-        fig.canvas.draw()
-        main_box = ax_main.get_position()  # figure coords
-
-        tbar_left_width = 0.010  # ~2% fig width
-        tbar_left_pad = 0.035 / W * 24  # gap from main heatmap, proportional to fig width
-
-        x0 = max(0.01, main_box.x0 - tbar_left_pad - tbar_left_width)
-        y0 = main_box.y0
-        w = tbar_left_width
-        h = main_box.height
-
-        ax_tleft = fig.add_axes([x0, y0, w, h])
-        m = np.nanmax(np.abs(t_ord)) if np.isfinite(t_ord).any() else 1.0
-        ax_tleft.imshow(
-            t_ord[:, None], origin="lower", aspect="auto", cmap=cmap, vmin=-m, vmax=m
-        )
-        ax_tleft.set_xticks([])
-        ax_tleft.set_yticks([])
-        ax_tleft.set_title("t-stat", fontsize=11, pad=2, fontweight="bold")
+    # # ---------- LEFT vertical t-stat bar (only if feature_order=="tstat") ----------
+    # if t_ord is not None:
+    #     fig.canvas.draw()
+    #     main_box = ax_main.get_position()  # figure coords
+
+    #     tbar_left_width = 0.010  # ~2% fig width
+    #     tbar_left_pad = 0.035 / W * 24  # gap from main heatmap, proportional to fig width
+
+    #     x0 = max(0.01, main_box.x0 - tbar_left_pad - tbar_left_width)
+    #     y0 = main_box.y0
+    #     w = tbar_left_width
+    #     h = main_box.height
+
+    #     ax_tleft = fig.add_axes([x0, y0, w, h])
+    #     m = np.nanmax(np.abs(t_ord)) if np.isfinite(t_ord).any() else 1.0
+    #     ax_tleft.imshow(
+    #         t_ord[:, None], origin="lower", aspect="auto", cmap=cmap, vmin=-m, vmax=m
+    #     )
+    #     ax_tleft.set_xticks([])
+    #     ax_tleft.set_yticks([])
+    #     ax_tleft.set_title("t-stat", fontsize=11, pad=2, fontweight="bold")
 
     # ---------- colorbar (slightly lower so it doesn't overlap class labels) ----------
     fig.canvas.draw()
diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index b82b58d..28bd57d 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -271,7 +271,7 @@
                             X,
                             y,
                             sample_order="label+cluster",
-                            feature_order="original",
+                            feature_order="tstat",
                             save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
                             show=False,
                         )
@@ -281,7 +281,7 @@
                             X,
                             y,
                             sample_order="label+cluster",  # clustering is per-split; that’s fine
-                            feature_order="original",  # we still show the t-bar for reference
+                            feature_order="tstat",  # we still show the t-bar for reference
                             col_order_from_train=orders["col_order"],
                             save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
                             show=False,

From 8ac153e03ca74800cc8707424613fbc75d6dd5e4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Nov 2025 19:40:04 -0500
Subject: [PATCH 286/401] minor

---
 .../helper_functions.py                       | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 2ad81b6..fcaa823 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -972,14 +972,14 @@ def plot_samples_features(
         if col_order_from_train is not None:
             col_order = np.asarray(col_order_from_train, int)
             t, _ = ttest_ind(Xz[y == 1], Xz[y == 0], axis=0, equal_var=False)
-            # t_ord = t[col_order]
+            t_ord = t[col_order]
         else:
             t, _ = ttest_ind(Xz[y == 1], Xz[y == 0], axis=0, equal_var=False)
             col_order = np.argsort(-np.abs(t))  # strongest contrast first
-            # t_ord = t[col_order]
+            t_ord = t[col_order]
     else:
         col_order = np.arange(n_features)
-        # t_ord = None  # no t-stat bar
+        t_ord = None  # no t-stat bar
 
     # ---------- sample order ----------
     if sample_order == "original":
@@ -1119,27 +1119,27 @@ def order_rows(A):
     # (re)grab the updated box for the colorbar placement that comes next
     lab_box = ax_lab.get_position()
 
-    # # ---------- LEFT vertical t-stat bar (only if feature_order=="tstat") ----------
-    # if t_ord is not None:
-    #     fig.canvas.draw()
-    #     main_box = ax_main.get_position()  # figure coords
-
-    #     tbar_left_width = 0.010  # ~2% fig width
-    #     tbar_left_pad = 0.035 / W * 24  # gap from main heatmap, proportional to fig width
-
-    #     x0 = max(0.01, main_box.x0 - tbar_left_pad - tbar_left_width)
-    #     y0 = main_box.y0
-    #     w = tbar_left_width
-    #     h = main_box.height
-
-    #     ax_tleft = fig.add_axes([x0, y0, w, h])
-    #     m = np.nanmax(np.abs(t_ord)) if np.isfinite(t_ord).any() else 1.0
-    #     ax_tleft.imshow(
-    #         t_ord[:, None], origin="lower", aspect="auto", cmap=cmap, vmin=-m, vmax=m
-    #     )
-    #     ax_tleft.set_xticks([])
-    #     ax_tleft.set_yticks([])
-    #     ax_tleft.set_title("t-stat", fontsize=11, pad=2, fontweight="bold")
+    # ---------- LEFT vertical t-stat bar (only if feature_order=="tstat") ----------
+    if t_ord is not None:
+        fig.canvas.draw()
+        main_box = ax_main.get_position()  # figure coords
+
+        tbar_left_width = 0.010  # ~2% fig width
+        tbar_left_pad = 0.035 / W * 24  # gap from main heatmap, proportional to fig width
+
+        x0 = max(0.01, main_box.x0 - tbar_left_pad - tbar_left_width)
+        y0 = main_box.y0
+        w = tbar_left_width
+        h = main_box.height
+
+        ax_tleft = fig.add_axes([x0, y0, w, h])
+        m = np.nanmax(np.abs(t_ord)) if np.isfinite(t_ord).any() else 1.0
+        ax_tleft.imshow(
+            t_ord[:, None], origin="lower", aspect="auto", cmap=cmap, vmin=-m, vmax=m
+        )
+        ax_tleft.set_xticks([])
+        ax_tleft.set_yticks([])
+        ax_tleft.set_title("t-stat", fontsize=11, pad=2, fontweight="bold")
 
     # ---------- colorbar (slightly lower so it doesn't overlap class labels) ----------
     fig.canvas.draw()

From f2119f993bacb59435e03cba6e0875dc19ff9183 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Nov 2025 20:15:17 -0500
Subject: [PATCH 287/401] minor

---
 .../multi_dataset_analysis/sample_matrix_visualization.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 28bd57d..b82b58d 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -271,7 +271,7 @@
                             X,
                             y,
                             sample_order="label+cluster",
-                            feature_order="tstat",
+                            feature_order="original",
                             save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
                             show=False,
                         )
@@ -281,7 +281,7 @@
                             X,
                             y,
                             sample_order="label+cluster",  # clustering is per-split; that’s fine
-                            feature_order="tstat",  # we still show the t-bar for reference
+                            feature_order="original",  # we still show the t-bar for reference
                             col_order_from_train=orders["col_order"],
                             save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
                             show=False,

From 756b04d046af4bdfa8f4348760ccc876b12e3895 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Nov 2025 21:14:31 -0500
Subject: [PATCH 288/401] modify fonts

---
 .../multi_dataset_analysis/helper_functions.py  | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index fcaa823..1eba465 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -1062,10 +1062,11 @@ def order_rows(A):
     # set ticks & labels
     ax_main.set_yticks(ticks_pos)
     ax_main.set_yticklabels([f"{v:d}" for v in labels_1based])
-    ax_main.set_ylabel("feature", fontsize=12, fontweight="bold")
-    ax_main.set_xlabel("sample", fontsize=12, fontweight="bold")
-    ax_main.set_xticks([])
-    ax_main.tick_params(axis="y", labelsize=10)
+    ax_main.set_ylabel("feature", fontsize=18, fontweight="bold")
+    ax_main.set_xlabel("sample", fontsize=18, fontweight="bold")
+    # ax_main.set_xticks([])
+    ax_main.tick_params(axis="y", labelsize=18)
+    ax_main.tick_params(axis="x", labelsize=18)
 
     if draw_separator and 0 < split < n_samples:
         ax_main.axvline(split - 0.5, color="k", lw=1)
@@ -1094,7 +1095,7 @@ def order_rows(A):
                 xycoords=("data", "axes fraction"),
                 ha="center",
                 va="top",
-                fontsize=11,
+                fontsize=18,
                 fontweight="bold",
             )
         if n1 > 0:
@@ -1105,7 +1106,7 @@ def order_rows(A):
                 xycoords=("data", "axes fraction"),
                 ha="center",
                 va="top",
-                fontsize=11,
+                fontsize=18,
                 fontweight="bold",
             )
 
@@ -1148,8 +1149,8 @@ def order_rows(A):
     cbar_y = max(0.01, lab_box.y0 - 0.085)  # you liked 0.085
     cax = fig.add_axes([0.12, cbar_y, 0.30, cbar_h])
     cb = plt.colorbar(im, cax=cax, orientation="horizontal")
-    cb.set_label("z-scored feature value", fontsize=11, fontweight="bold")
-    cb.ax.tick_params(labelsize=10)
+    cb.set_label("z-scored feature value", fontsize=18, fontweight="bold")
+    cb.ax.tick_params(labelsize=18)
 
     if title:
         fig.suptitle(title, y=0.995, fontsize=12, fontweight="bold")

From 29ddaf53b6da300281602eabf6aec4cebc342be3 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Nov 2025 21:23:39 -0500
Subject: [PATCH 289/401] minor

---
 task_dFC/multi_dataset_analysis/helper_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 1eba465..0b6e9d1 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -1113,7 +1113,7 @@ def order_rows(A):
     # --- move the class bar (ax_lab) down a bit ---
     fig.canvas.draw()  # ensure positions are current
     lab_box = ax_lab.get_position()  # [x0, y0, width, height] in figure coords
-    down = 0.020  # how much to move down (figure fraction)
+    down = 0.070  # how much to move down (figure fraction)
     new_y0 = max(0.01, lab_box.y0 - down)  # keep it inside the figure
     ax_lab.set_position([lab_box.x0, new_y0, lab_box.width, lab_box.height])
 

From af15727c87b22ae7ffa1ee26a86c92006b14b29a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Nov 2025 23:13:31 -0500
Subject: [PATCH 290/401] improve figures

---
 .../helper_functions.py                       | 110 ++++++++++++------
 .../sample_matrix_visualization.py            |   9 ++
 2 files changed, 83 insertions(+), 36 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 0b6e9d1..6ff8874 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -23,11 +23,11 @@ def setup_pub_style():
     mpl.rcParams.update(
         {
             # Fonts & text
-            "font.size": 10,  # base
+            "font.size": 18,  # base
             "axes.titlesize": 12,
             "axes.labelsize": 11,
-            "xtick.labelsize": 9,
-            "ytick.labelsize": 9,
+            "xtick.labelsize": 18,
+            "ytick.labelsize": 18,
             "legend.fontsize": 9,
             "figure.titlesize": 13,
             "axes.titlepad": 8,
@@ -167,9 +167,9 @@ def get_cog_domain_info(simul_or_real: str):
         # --- Categories of simulated task paradigms ---
         DOMAIN_ORDER = [
             "Simulated Periodic",
-            "Good Paradigm Design, Strong Performance on Real Data",
-            "Good Paradigm Design, Poor Performance on Real Data",
-            "Poor Paradigm Design, Poor Performance on Real Data",
+            "Optimal Paradigm Design, Strong Performance on Real Data",
+            "Optimal Paradigm Design, Limited Performance on Real Data",
+            "Sub-Optimal Paradigm Design, Limited Performance on Real Data",
         ]
         # --- Map task codes -> category ---
         TASK2DOMAIN = {
@@ -177,26 +177,26 @@ def get_cog_domain_info(simul_or_real: str):
             "lowfreqlongrest": "Simulated Periodic",
             "lowfreqshortrest": "Simulated Periodic",
             "lowfreqshorttask": "Simulated Periodic",
-            # Good Paradigm Design, Strong Performance on Real Data
-            "axcpt": "Good Paradigm Design, Strong Performance on Real Data",
-            "stern": "Good Paradigm Design, Strong Performance on Real Data",
-            "cuedts": "Good Paradigm Design, Strong Performance on Real Data",
-            # Good Paradigm Design, Poor Performance on Real Data
-            "execution": "Good Paradigm Design, Poor Performance on Real Data",
-            "imagery": "Good Paradigm Design, Poor Performance on Real Data",
-            "localizer": "Good Paradigm Design, Poor Performance on Real Data",
-            "ppalocalizer": "Good Paradigm Design, Poor Performance on Real Data",
-            # Poor Paradigm Design, Poor Performance on Real Data
-            "itc": "Poor Paradigm Design, Poor Performance on Real Data",
-            "stroop": "Poor Paradigm Design, Poor Performance on Real Data",
-            "risk": "Poor Paradigm Design, Poor Performance on Real Data",
+            # Optimal Paradigm Design, Strong Performance on Real Data
+            "axcpt": "Optimal Paradigm Design, Strong Performance on Real Data",
+            "stern": "Optimal Paradigm Design, Strong Performance on Real Data",
+            "cuedts": "Optimal Paradigm Design, Strong Performance on Real Data",
+            # Optimal Paradigm Design, Limited Performance on Real Data
+            "execution": "Optimal Paradigm Design, Limited Performance on Real Data",
+            "imagery": "Optimal Paradigm Design, Limited Performance on Real Data",
+            "localizer": "Optimal Paradigm Design, Limited Performance on Real Data",
+            "ppalocalizer": "Optimal Paradigm Design, Limited Performance on Real Data",
+            # Sub-Optimal Paradigm Design, Limited Performance on Real Data
+            "itc": "Sub-Optimal Paradigm Design, Limited Performance on Real Data",
+            "stroop": "Sub-Optimal Paradigm Design, Limited Performance on Real Data",
+            "risk": "Sub-Optimal Paradigm Design, Limited Performance on Real Data",
         }
         # base colors per domain (distinct, colorblind-friendly)
         DOMAIN_BASE = {
             "Simulated Periodic": "#1f77b4",
-            "Good Paradigm Design, Strong Performance on Real Data": "#ff7f0e",
-            "Good Paradigm Design, Poor Performance on Real Data": "#02833E",
-            "Poor Paradigm Design, Poor Performance on Real Data": "#d62728",
+            "Optimal Paradigm Design, Strong Performance on Real Data": "#ff7f0e",
+            "Optimal Paradigm Design, Limited Performance on Real Data": "#02833E",
+            "Sub-Optimal Paradigm Design, Limited Performance on Real Data": "#d62728",
         }
     else:
         raise ValueError(f"Invalid simul_or_real: {simul_or_real}")
@@ -1037,7 +1037,7 @@ def order_rows(A):
 
     # ---------- main heatmap ----------
     img = Xz[row_order, :][:, col_order].T  # (features, samples)
-    im = ax_main.imshow(
+    ax_main.imshow(
         img, aspect="auto", origin="lower", cmap=cmap, vmin=-V_RANGE, vmax=V_RANGE
     )
     n_features = img.shape[0]
@@ -1063,13 +1063,13 @@ def order_rows(A):
     ax_main.set_yticks(ticks_pos)
     ax_main.set_yticklabels([f"{v:d}" for v in labels_1based])
     ax_main.set_ylabel("feature", fontsize=18, fontweight="bold")
-    ax_main.set_xlabel("sample", fontsize=18, fontweight="bold")
+    # ax_main.set_xlabel("sample", fontsize=18, fontweight="bold")
     # ax_main.set_xticks([])
     ax_main.tick_params(axis="y", labelsize=18)
     ax_main.tick_params(axis="x", labelsize=18)
 
     if draw_separator and 0 < split < n_samples:
-        ax_main.axvline(split - 0.5, color="k", lw=1)
+        ax_main.axvline(split - 0.5, color="k", lw=2)
 
     # ---------- bottom class strip ----------
     y_reordered = y[row_order]
@@ -1113,10 +1113,18 @@ def order_rows(A):
     # --- move the class bar (ax_lab) down a bit ---
     fig.canvas.draw()  # ensure positions are current
     lab_box = ax_lab.get_position()  # [x0, y0, width, height] in figure coords
-    down = 0.070  # how much to move down (figure fraction)
+    down = 0.050  # how much to move down (figure fraction)
     new_y0 = max(0.01, lab_box.y0 - down)  # keep it inside the figure
     ax_lab.set_position([lab_box.x0, new_y0, lab_box.width, lab_box.height])
 
+    # after you position ax_lab (i.e., after ax_lab.set_position([...]))
+    ax_lab.xaxis.set_label_position("top")
+    ax_lab.set_xlabel("sample", labelpad=6, fontweight="bold", fontsize=18)
+    # keep the strip clean
+    ax_lab.tick_params(
+        axis="x", which="both", length=0, labelbottom=False, labeltop=False
+    )
+
     # (re)grab the updated box for the colorbar placement that comes next
     lab_box = ax_lab.get_position()
 
@@ -1142,16 +1150,6 @@ def order_rows(A):
         ax_tleft.set_yticks([])
         ax_tleft.set_title("t-stat", fontsize=11, pad=2, fontweight="bold")
 
-    # ---------- colorbar (slightly lower so it doesn't overlap class labels) ----------
-    fig.canvas.draw()
-    lab_box = ax_lab.get_position()
-    cbar_h = 0.02
-    cbar_y = max(0.01, lab_box.y0 - 0.085)  # you liked 0.085
-    cax = fig.add_axes([0.12, cbar_y, 0.30, cbar_h])
-    cb = plt.colorbar(im, cax=cax, orientation="horizontal")
-    cb.set_label("z-scored feature value", fontsize=18, fontweight="bold")
-    cb.ax.tick_params(labelsize=18)
-
     if title:
         fig.suptitle(title, y=0.995, fontsize=12, fontweight="bold")
 
@@ -1163,3 +1161,43 @@ def order_rows(A):
         plt.close(fig)
 
     return dict(row_order=row_order, col_order=col_order)
+
+
+def save_scalar_colorbar(
+    cmap="coolwarm",
+    vmin=-2.0,
+    vmax=2.0,  # use the same V_RANGE you use in plots
+    label="z-scored feature value",
+    filename="zscore_colorbar.png",
+    orientation="horizontal",
+    figsize=(6, 0.4),  # width, height in inches
+    dpi=300,
+    ticks=None,
+):
+    """
+    Save a standalone colorbar for `cmap` over [vmin, vmax] to `filename` (no data plotted), for reuse in the paper.
+    """
+    # Local imports used to build the dummy mappable that carries the colormap and limits
+    from matplotlib.cm import ScalarMappable
+    from matplotlib.colors import Normalize
+
+    fig = plt.figure(figsize=figsize, dpi=dpi)
+    ax = fig.add_axes(  # thin strip whose long side follows `orientation`
+        [0.05, 0.35, 0.90, 0.30]
+        if orientation == "horizontal"
+        else [0.35, 0.05, 0.30, 0.90]
+    )
+
+    sm = ScalarMappable(norm=Normalize(vmin=vmin, vmax=vmax), cmap=cmap)
+    sm.set_array([])  # no data attached; the mappable only carries norm + cmap
+
+    cb = plt.colorbar(sm, cax=ax, orientation=orientation)
+    cb.set_label(label, fontsize=18, fontweight="bold")
+
+    if ticks is not None:  # optional explicit ticks, labeled with their str() form
+        cb.set_ticks(ticks)
+        cb.set_ticklabels([str(t) for t in ticks])
+    cb.ax.tick_params(labelsize=18)
+
+    fig.savefig(filename, bbox_inches="tight", pad_inches=0.02)
+    plt.close(fig)
diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index b82b58d..ac35d1f 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -15,6 +15,7 @@
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from helper_functions import (  # pyright: ignore[reportMissingImports]
     plot_samples_features,
+    save_scalar_colorbar,
 )
 
 use_raw_features = False  # if True, use raw dFC features instead of embedded features
@@ -286,3 +287,11 @@
                             save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
                             show=False,
                         )
+
+                    save_scalar_colorbar(
+                        cmap="coolwarm",
+                        vmin=-1.6,
+                        vmax=1.6,  # use the same V_RANGE you use in plots
+                        label="z-scored feature value",
+                        filename=f"{output_root}/zscore_colorbar.png",
+                    )

From 02d075f28c1e4595daf11d388f546bb9fa2f4216 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Nov 2025 23:25:09 -0500
Subject: [PATCH 291/401] minor

---
 task_dFC/multi_dataset_analysis/sample_matrix_visualization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index ac35d1f..12df498 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -18,7 +18,7 @@
     save_scalar_colorbar,
 )
 
-use_raw_features = False  # if True, use raw dFC features instead of embedded features
+use_raw_features = True  # if True, use raw dFC features instead of embedded features
 normalize_dFC = True
 FCS_proba_for_SB = True
 train_test_ratio = 0.8

From c223b2ea0eab54c2b41a111ee75a6ce2b87d0602 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 20 Nov 2025 23:36:25 -0500
Subject: [PATCH 292/401] minor

---
 .../multi_dataset_analysis/sample_matrix_visualization.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 12df498..3e36243 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -18,7 +18,7 @@
     save_scalar_colorbar,
 )
 
-use_raw_features = True  # if True, use raw dFC features instead of embedded features
+use_raw_features = False  # if True, use raw dFC features instead of embedded features
 normalize_dFC = True
 FCS_proba_for_SB = True
 train_test_ratio = 0.8
@@ -68,7 +68,7 @@
     if not os.path.exists(output_root):
         os.makedirs(output_root)
 
-    for dataset in ["ds004848"]:
+    for dataset in DATASETS:
         dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
         roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
         dFC_root = f"{main_root}/{dataset}/derivatives/dFC_assessed"

From 67704d997fa780d6642c930cfab93d8f775318bf Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 24 Nov 2025 22:52:24 -0500
Subject: [PATCH 293/401] add periodicity_index and optimality_index

---
 pydfc/task_utils.py                           | 154 +++++++++++++++++-
 .../performance_predict.py                    |  75 +++++++--
 2 files changed, 213 insertions(+), 16 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 0f3c131..25be15a 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -509,7 +509,7 @@ def extract_task_presence(
     return task_presence, indices
 
 
-################################# Task Features ####################################
+################################# Task Design Features ####################################
 
 
 def calc_relative_task_on(task_presence):
@@ -574,3 +574,155 @@ def calc_transition_freq(task_presence):
     num_of_transitions = np.sum(transitions)
     relative_transition_freq = num_of_transitions / len(task_presence)
     return num_of_transitions, relative_transition_freq
+
+
+def noise_model(f, alpha=1.0):
+    # 1/f^alpha spectrum scaled to unit median over f > 0; f must support the boolean mask `f > 0` (e.g. ndarray)
+    spec = 1.0 / np.maximum(f, 1e-6) ** alpha  # floor f at 1e-6 so the f == 0 bin stays finite
+    med = np.median(spec[f > 0])  # median taken over strictly positive frequencies only
+    return spec / med
+
+
+def compute_periodicity_index(
+    event_labels,
+    TR_task,
+    fmin=0.0,
+    fmax=None,
+    no_hrf=False,
+    TR_mri=None,
+):
+    """
+    Compute a noise-free periodicity index for a task timing time course.
+
+    Parameters
+    ----------
+    event_labels : array, shape (T,)
+        Event labels time course; every non-zero label is treated as task-on.
+    TR_task : float
+        Sample spacing (seconds) used for the FFT frequency axis and the Nyquist default.
+    fmin, fmax : float
+        Frequency band (Hz) to consider. If fmax is None, Nyquist is used.
+    no_hrf : bool
+        If True, do not convolve with HRF.
+    TR_mri : float
+        Repetition time of MRI (seconds), required if no_hrf is False.
+
+    Returns
+    -------
+    results : dict
+        {
+          'periodicity_index': float in [0, 1], higher = more periodic,
+          'spectral_entropy': float in [0, 1], lower = more periodic,
+          'peak_freq': float, frequency of dominant peak (Hz),
+          'peak_dominance': float in [0, 1], peak power / total power
+        }
+    """
+    if no_hrf:
+        task_tc = np.multiply(event_labels != 0, 1)  # 1 where label != 0, else 0
+    else:
+        event_labels_all_task = np.multiply(event_labels != 0, 1)  # 1 where label != 0, else 0
+        task_tc = event_labels_conv_hrf(
+            event_labels=event_labels_all_task,
+            TR_mri=TR_mri,
+            TR_task=TR_task,
+            no_hrf=False,
+        )
+    task_tc = np.asarray(task_tc)
+    T = len(task_tc)
+
+    # Mean-center only (zeroes the DC bin); no detrending is applied
+    x = task_tc - np.mean(task_tc)
+
+    # One-sided power spectrum. NOTE(review): d=TR_task assumes task_tc is still sampled at TR_task after event_labels_conv_hrf; confirm
+    freqs = np.fft.rfftfreq(T, d=TR_task)
+    fft_vals = np.fft.rfft(x)
+    power = np.abs(fft_vals) ** 2
+
+    # Restrict frequency range
+    if fmax is None:
+        fmax = 0.5 / TR_task  # Nyquist
+    mask = (freqs >= fmin) & (freqs <= fmax)
+    freqs = freqs[mask]
+    power = power[mask]
+
+    # All-zero spectrum in the band (e.g. constant input): return the degenerate result instead of dividing by zero
+    if np.all(power == 0):
+        return {
+            "periodicity_index": 0.0,
+            "spectral_entropy": 1.0,
+            "peak_freq": 0.0,
+            "peak_dominance": 0.0,
+        }
+
+    # Normalize spectrum to probability distribution
+    p = power / power.sum()
+
+    # Spectral entropy (normalized to [0,1]). NOTE(review): np.log(len(p)) is 0 if only one bin survives the mask
+    eps = 1e-12
+    H = -(p * np.log(p + eps)).sum() / np.log(len(p))  # in [0,1], higher = more "flat"
+
+    # Dominant peak and its dominance
+    peak_idx = np.argmax(power)
+    peak_freq = freqs[peak_idx]
+    peak_power = power[peak_idx]
+    peak_dominance = peak_power / power.sum()  # 0–1
+
+    # Define periodicity index: high when entropy is low and peak is dominant
+    periodicity_index = (1.0 - H) * peak_dominance
+
+    return {
+        "periodicity_index": float(periodicity_index),
+        "spectral_entropy": float(H),
+        "peak_freq": float(peak_freq),
+        "peak_dominance": float(peak_dominance),
+    }
+
+
+def compute_optimality_index(
+    event_labels, TR_task, TR_mri, fmin=0.0, fmax=None, alpha=1.0
+):
+    """
+    Worsley-style optimality (unnormalized float): how well the design energy overlaps HRF^2 / noise.
+    """
+    time_length_HRF = 32.0  # in sec
+    oversampling = TR_mri / TR_task  # presumably makes the HRF sample step equal TR_task; confirm against spm_hrf
+
+    task_tc = np.multiply(event_labels != 0, 1)  # 1 where label != 0, else 0 (not mean-centered)
+    hrf_tc = glm.first_level.spm_hrf(
+        tr=TR_mri, oversampling=oversampling, time_length=time_length_HRF, onset=0.0
+    )
+
+    task_tc = np.asarray(task_tc)
+    hrf_tc = np.asarray(hrf_tc)
+
+    T = len(task_tc)
+    if len(hrf_tc) < T:
+        # zero-pad HRF to length T
+        hrf_tc = np.pad(hrf_tc, (0, T - len(hrf_tc)), mode="constant")
+    else:
+        hrf_tc = hrf_tc[:T]  # truncate HRF so both spectra share the same frequency bins
+
+    freqs = np.fft.rfftfreq(T, d=TR_task)
+    # noise PSD: modeled (not estimated from data) as 1/f^alpha via noise_model
+    noise_psd = noise_model(freqs, alpha=alpha)
+
+    design_spectrum = np.abs(np.fft.rfft(task_tc)) ** 2
+    hrf_spectrum = np.abs(np.fft.rfft(hrf_tc)) ** 2
+
+    if fmax is None:
+        fmax = 0.5 / TR_task  # Nyquist for a sample spacing of TR_task
+    mask = (freqs >= fmin) & (freqs <= fmax)
+    freqs = freqs[mask]
+    design_spectrum = design_spectrum[mask]
+    hrf_spectrum = hrf_spectrum[mask]
+    noise_psd = np.asarray(noise_psd)[mask]
+
+    # avoid division by zero
+    eps = 1e-12
+    snr_weight = hrf_spectrum / (noise_psd + eps)
+
+    # Optimality index ~ ∑ design_power * (HRF^2 / noise)
+    oi = np.sum(design_spectrum * snr_weight)
+
+    # NOTE: no normalization is applied here; the raw sum is returned, so values are not confined to [0,1]
+    return float(oi)
diff --git a/task_dFC/multi_dataset_analysis/performance_predict.py b/task_dFC/multi_dataset_analysis/performance_predict.py
index dbc1989..7f332c1 100644
--- a/task_dFC/multi_dataset_analysis/performance_predict.py
+++ b/task_dFC/multi_dataset_analysis/performance_predict.py
@@ -17,6 +17,8 @@
     calc_task_duration,
     calc_transition_freq,
     cohen_d_bold,
+    compute_optimality_index,
+    compute_periodicity_index,
     extract_task_presence,
 )
 
@@ -80,6 +82,8 @@
     transition_freq_all = {}
     rest_durations_all = {}
     task_durations_all = {}
+    PI_all = {}
+    OI_all = {}
     for dataset in DATASETS:
 
         print(f"Processing dataset: {dataset}")
@@ -149,6 +153,21 @@
                         task_durations = calc_task_duration(
                             event_labels, TR_mri=1 / task_data["Fs_task"]
                         )
+                        # Periodicity Index (low entropy => high periodicity)
+                        out = compute_periodicity_index(
+                            event_labels=event_labels,
+                            TR_task=1 / task_data["Fs_task"],
+                            no_hrf=False,
+                            TR_mri=task_data["TR_mri"],
+                        )
+                        PI = out["periodicity_index"]
+
+                        # Optimality Index (how close the task design is to the optimal design)
+                        OI = compute_optimality_index(
+                            event_labels=event_labels,
+                            TR_task=1 / task_data["Fs_task"],
+                            TR_mri=task_data["TR_mri"],
+                        )
 
                         if not task in task_ratio_all:
                             task_ratio_all[task] = []
@@ -158,17 +177,25 @@
                             rest_durations_all[task] = []
                         if not task in task_durations_all:
                             task_durations_all[task] = []
+                        if not task in PI_all:
+                            PI_all[task] = []
+                        if not task in OI_all:
+                            OI_all[task] = []
                         task_ratio_all[task].append(relative_task_on)
                         transition_freq_all[task].append(relative_transition_freq)
                         # rest_durations and task_durations are lists
                         rest_durations_all[task].extend(rest_durations)
                         task_durations_all[task].extend(task_durations)
+                        PI_all[task].append(PI)
+                        OI_all[task].append(OI)
 
     task_design_features = {
         "task_ratio_all": task_ratio_all,
         "transition_freq_all": transition_freq_all,
         "rest_durations_all": rest_durations_all,
         "task_durations_all": task_durations_all,
+        "PI_all": PI_all,
+        "OI_all": OI_all,
     }
 
     CohensD_across_task = {}
@@ -376,19 +403,24 @@
     metric = "SVM balanced accuracy"
     GROUP = "test"
 
-    all_scores = {}
-    for i in range(len(ALL_ML_SCORES["task"])):
-        if (
-            ALL_ML_SCORES["embedding"][i] == embedding
-            and ALL_ML_SCORES["group"][i] == GROUP
-        ):
-
-            if ALL_ML_SCORES["task"][i] not in all_scores:
-                all_scores[ALL_ML_SCORES["task"][i]] = []
-            all_scores[ALL_ML_SCORES["task"][i]].append(ALL_ML_SCORES[metric][i])
+    METHODS = set(ALL_ML_SCORES["dFC method"])
+    all_scores = {method: {} for method in METHODS}
+    for method in METHODS:
+        for i in range(len(ALL_ML_SCORES["task"])):
+            if (
+                ALL_ML_SCORES["embedding"][i] == embedding
+                and ALL_ML_SCORES["group"][i] == GROUP
+                and ALL_ML_SCORES["dFC method"][i] == method
+            ):
+                if ALL_ML_SCORES["task"][i] not in all_scores[method]:
+                    all_scores[method][ALL_ML_SCORES["task"][i]] = []
+                all_scores[method][ALL_ML_SCORES["task"][i]].append(
+                    ALL_ML_SCORES[metric][i]
+                )
 
-    # all_scores is a list of scores across methods and runs
-    all_scores = {k: np.array(v) for k, v in all_scores.items()}
+    # all_scores[<method>][<task>] is a list of scores across runs
+    for method in all_scores:
+        all_scores[method] = {k: np.array(v) for k, v in all_scores[method].items()}
 
     # we have task design features in task_design_features[task_ratio_all][task], task_design_features[transition_freq_all][task], task_design_features[rest_durations_all][task], task_design_features[task_durations_all][task]
     # we have CohensD in CohensD_across_task[task]
@@ -402,8 +434,9 @@
         "task_durations_mean": [],
         "rest_durations_std": [],
         "task_durations_std": [],
+        "PI_mean": [],
+        "OI_mean": [],
         "cohen_d_max": [],
-        "classfication_score_mean": [],
     }
     for task in TASKS_to_include:
         task_ratio = np.mean(task_design_features["task_ratio_all"][task])
@@ -412,8 +445,9 @@
         task_durations_mean = np.mean(task_design_features["task_durations_all"][task])
         rest_durations_std = np.std(task_design_features["rest_durations_all"][task])
         task_durations_std = np.std(task_design_features["task_durations_all"][task])
+        PI_mean = np.mean(PI_all[task])
+        OI_mean = np.mean(OI_all[task])
         cohen_d_max = np.max(np.abs(CohensD_across_task[task]))
-        classfication_score_mean = np.mean(all_scores[task])
 
         DATA["task"].append(task)
         DATA["task_ratio"].append(task_ratio)
@@ -422,8 +456,19 @@
         DATA["task_durations_mean"].append(task_durations_mean)
         DATA["rest_durations_std"].append(rest_durations_std)
         DATA["task_durations_std"].append(task_durations_std)
+        DATA["PI_mean"].append(PI_mean)
+        DATA["OI_mean"].append(OI_mean)
         DATA["cohen_d_max"].append(cohen_d_max)
-        DATA["classfication_score_mean"].append(classfication_score_mean)
+
+        # Also add ML scores
+        for method in all_scores:
+            if f"classfication_score_{method}" not in DATA:
+                DATA[f"classfication_score_{method}"] = []
+            if task in all_scores[method]:
+                score_mean = np.mean(all_scores[method][task])
+            else:
+                score_mean = np.nan
+            DATA[f"classfication_score_{method}"].append(score_mean)
 
     # save DATA
     np.save(f"{output_root}/performance_predictor_data.npy", DATA)

From 3de514ee913e019d196e005a53de709fede09f34 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 25 Nov 2025 22:13:56 -0500
Subject: [PATCH 294/401] minor

---
 pydfc/task_utils.py                                    | 5 +----
 task_dFC/multi_dataset_analysis/performance_predict.py | 1 -
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 25be15a..d05b669 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -589,7 +589,6 @@ def compute_periodicity_index(
     fmin=0.0,
     fmax=None,
     no_hrf=False,
-    TR_mri=None,
 ):
     """
     Compute a noise-free periodicity index for a task timing time course.
@@ -604,8 +603,6 @@ def compute_periodicity_index(
         Frequency band (Hz) to consider. If fmax is None, Nyquist is used.
     no_hrf : bool
         If True, do not convolve with HRF.
-    TR_mri : float
-        Repetition time of MRI (seconds), required if no_hrf is False.
 
     Returns
     -------
@@ -623,7 +620,7 @@ def compute_periodicity_index(
         event_labels_all_task = np.multiply(event_labels != 0, 1)
         task_tc = event_labels_conv_hrf(
             event_labels=event_labels_all_task,
-            TR_mri=TR_mri,
+            TR_mri=TR_task,
             TR_task=TR_task,
             no_hrf=False,
         )
diff --git a/task_dFC/multi_dataset_analysis/performance_predict.py b/task_dFC/multi_dataset_analysis/performance_predict.py
index 7f332c1..47d3a0d 100644
--- a/task_dFC/multi_dataset_analysis/performance_predict.py
+++ b/task_dFC/multi_dataset_analysis/performance_predict.py
@@ -158,7 +158,6 @@
                             event_labels=event_labels,
                             TR_task=1 / task_data["Fs_task"],
                             no_hrf=False,
-                            TR_mri=task_data["TR_mri"],
                         )
                         PI = out["periodicity_index"]
 

From a9c63b4f5f9843b6867827317c437544a151be83 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 25 Nov 2025 23:07:12 -0500
Subject: [PATCH 295/401] minor

---
 pydfc/task_utils.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index d05b669..fad5791 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -617,11 +617,13 @@ def compute_periodicity_index(
     if no_hrf:
         task_tc = np.multiply(event_labels != 0, 1)
     else:
-        event_labels_all_task = np.multiply(event_labels != 0, 1)
-        task_tc = event_labels_conv_hrf(
-            event_labels=event_labels_all_task,
-            TR_mri=TR_task,
+        task_tc, _ = extract_task_presence(
+            event_labels=event_labels,
             TR_task=TR_task,
+            TR_mri=TR_task,
+            TR_array=None,
+            binary=False,
+            binarizing_method="GMM",
             no_hrf=False,
         )
     task_tc = np.asarray(task_tc)

From 419ef3f3ebcdc5fe7e6a0392dafe1d07d3a024cc Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 25 Nov 2025 23:58:53 -0500
Subject: [PATCH 296/401] normalize OI

---
 pydfc/task_utils.py                           | 112 +++++++++++++++---
 .../performance_predict.py                    |   3 +-
 2 files changed, 95 insertions(+), 20 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index fad5791..0c9016b 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -681,47 +681,121 @@ def compute_optimality_index(
     event_labels, TR_task, TR_mri, fmin=0.0, fmax=None, alpha=1.0
 ):
     """
-    Worsley-style optimality: how well the design energy overlaps HRF^2 / noise.
+    Compute a Worsley-style optimality index (OI) and normalized OI
+    relative to an ideal sinusoidal design at the dominant frequency.
+
+    Returns:
+    --------
+    {
+      "OI": float,
+      "OI_ideal": float,
+      "OI_norm": float,
+      "peak_freq": float
+    }
     """
-    time_length_HRF = 32.0  # in sec
+
+    # -------------------------
+    # 1. Preprocess Task Timing
+    # -------------------------
+    task_tc = np.multiply(event_labels != 0, 1).astype(float).flatten()
+    T = len(task_tc)
+
+    # -------------------------
+    # 2. HRF Model
+    # -------------------------
+    time_length_HRF = 32.0
     oversampling = TR_mri / TR_task
 
-    task_tc = np.multiply(event_labels != 0, 1)
     hrf_tc = glm.first_level.spm_hrf(
         tr=TR_mri, oversampling=oversampling, time_length=time_length_HRF, onset=0.0
     )
-
-    task_tc = np.asarray(task_tc)
     hrf_tc = np.asarray(hrf_tc)
 
-    T = len(task_tc)
+    # Pad or truncate HRF to length T
     if len(hrf_tc) < T:
-        # zero-pad HRF to length T
         hrf_tc = np.pad(hrf_tc, (0, T - len(hrf_tc)), mode="constant")
     else:
         hrf_tc = hrf_tc[:T]
 
+    # -------------------------
+    # 3. Frequency grid + noise PSD
+    # -------------------------
     freqs = np.fft.rfftfreq(T, d=TR_task)
-    # noise: estimate by 1/f
-    noise_psd = noise_model(freqs, alpha=alpha)
 
+    # simple 1/f^alpha noise
+    noise_psd = (freqs + 1e-6) ** (-alpha)
+
+    # -------------------------
+    # 4. FFT-based spectra
+    # -------------------------
     design_spectrum = np.abs(np.fft.rfft(task_tc)) ** 2
     hrf_spectrum = np.abs(np.fft.rfft(hrf_tc)) ** 2
 
+    # -------------------------
+    # 5. Frequency mask
+    # -------------------------
     if fmax is None:
         fmax = 0.5 / TR_task
+
     mask = (freqs >= fmin) & (freqs <= fmax)
-    freqs = freqs[mask]
-    design_spectrum = design_spectrum[mask]
-    hrf_spectrum = hrf_spectrum[mask]
-    noise_psd = np.asarray(noise_psd)[mask]
+    freqs_m = freqs[mask]
+    design_spectrum_m = design_spectrum[mask]
+    hrf_spectrum_m = hrf_spectrum[mask]
+    noise_psd_m = noise_psd[mask]
 
-    # avoid division by zero
     eps = 1e-12
-    snr_weight = hrf_spectrum / (noise_psd + eps)
+    snr_weight = hrf_spectrum_m / (noise_psd_m + eps)
+
+    # -------------------------
+    # 6. ORIGINAL (TASK) OI
+    # -------------------------
+    OI = np.sum(design_spectrum_m * snr_weight)
+
+    # -------------------------
+    # 7. IDEAL OI (sinusoid at peak frequency)
+    # -------------------------
 
-    # Optimality index ~ ∑ design_power * (HRF^2 / noise)
-    oi = np.sum(design_spectrum * snr_weight)
+    # Remove DC by ignoring freq = 0
+    nonzero_mask = freqs_m > 0
+    if not np.any(nonzero_mask):
+        # no nonzero frequencies in the band
+        return {
+            "OI": float(OI),
+            "OI_ideal": 0.0,
+            "OI_norm": 0.0,
+            "peak_freq": 0.0,
+        }
+
+    freqs_nz = freqs_m[nonzero_mask]
+    design_spectrum_nz = design_spectrum_m[nonzero_mask]
+
+    # Find dominant non-DC frequency
+    peak_idx = np.argmax(design_spectrum_nz)
+    peak_freq = freqs_nz[peak_idx]
+
+    # Build ideal sinusoid
+    t = np.arange(T) * TR_task
+    ideal_tc = np.sin(2 * np.pi * peak_freq * t)
+
+    # FFT of ideal design
+    ideal_spectrum = np.abs(np.fft.rfft(ideal_tc)) ** 2
+    ideal_spectrum_m = ideal_spectrum[mask]
 
-    # normalize (optional) so it's in [0,1] across paradigms
-    return float(oi)
+    # Ideal OI
+    OI_ideal = np.sum(ideal_spectrum_m * snr_weight)
+
+    # -------------------------
+    # 8. Normalized OI
+    # -------------------------
+    if OI_ideal < eps:
+        OI_norm = 0.0
+    else:
+        OI_norm = OI / OI_ideal
+        OI_norm = max(0.0, min(1.0, float(OI_norm)))  # clamp to [0,1]
+
+    return {
+        "OI": float(OI),
+        "OI_ideal": float(OI_ideal),
+        "OI_norm": float(OI_norm),
+        "peak_freq": float(peak_freq),
+    }
diff --git a/task_dFC/multi_dataset_analysis/performance_predict.py b/task_dFC/multi_dataset_analysis/performance_predict.py
index 47d3a0d..5ebf36d 100644
--- a/task_dFC/multi_dataset_analysis/performance_predict.py
+++ b/task_dFC/multi_dataset_analysis/performance_predict.py
@@ -162,11 +162,12 @@
                         PI = out["periodicity_index"]
 
                         # Optimality Index (how close the task design is to the optimal design)
-                        OI = compute_optimality_index(
+                        out = compute_optimality_index(
                             event_labels=event_labels,
                             TR_task=1 / task_data["Fs_task"],
                             TR_mri=task_data["TR_mri"],
                         )
+                        OI = out["OI_norm"]
 
                         if not task in task_ratio_all:
                             task_ratio_all[task] = []

From 87435aeb25aaf39bca004f06325b31ac613aae48 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 26 Nov 2025 22:25:29 -0500
Subject: [PATCH 297/401] minor

---
 task_dFC/multi_dataset_analysis/performance_predict.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_predict.py b/task_dFC/multi_dataset_analysis/performance_predict.py
index 5ebf36d..5908aaf 100644
--- a/task_dFC/multi_dataset_analysis/performance_predict.py
+++ b/task_dFC/multi_dataset_analysis/performance_predict.py
@@ -471,4 +471,4 @@
             DATA[f"classfication_score_{method}"].append(score_mean)
 
     # save DATA
-    np.save(f"{output_root}/performance_predictor_data.npy", DATA)
+    np.save(f"{output_root}/performance_predictor_data_{simul_or_real}.npy", DATA)

From 8ccbe25d13e4e5fb5874055ff860aac92d7008d1 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 26 Nov 2025 22:26:59 -0500
Subject: [PATCH 298/401] minor

---
 task_dFC/multi_dataset_analysis/performance_predict.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_predict.py b/task_dFC/multi_dataset_analysis/performance_predict.py
index 5908aaf..5ebf36d 100644
--- a/task_dFC/multi_dataset_analysis/performance_predict.py
+++ b/task_dFC/multi_dataset_analysis/performance_predict.py
@@ -471,4 +471,4 @@
             DATA[f"classfication_score_{method}"].append(score_mean)
 
     # save DATA
-    np.save(f"{output_root}/performance_predictor_data_{simul_or_real}.npy", DATA)
+    np.save(f"{output_root}/performance_predictor_data.npy", DATA)

From 25ef679ae899d11c9fcf3e0579d8c031f6abcec7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 27 Nov 2025 00:43:02 -0500
Subject: [PATCH 299/401] add PAC

---
 pydfc/task_utils.py                           | 72 +++++++++++++++++++
 .../performance_predict.py                    | 14 ++++
 2 files changed, 86 insertions(+)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 0c9016b..d1dfd79 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -13,6 +13,7 @@
 from nilearn import glm
 from scipy import signal
 from sklearn.mixture import GaussianMixture
+from statsmodels.tsa.stattools import acf
 
 from .dfc_utils import TR_intersection, rank_norm, visualize_conn_mat
 
@@ -799,3 +800,74 @@ def compute_optimality_index(
         "OI_norm": float(OI_norm),
         "peak_freq": float(peak_freq),
     }
+
+
+from scipy.ndimage import uniform_filter1d
+from scipy.signal import find_peaks
+
+
+def periodicity_autocorr(event_labels, TR_task, max_lag=None):
+    """
+    Score the periodicity of a task time course from its autocorrelation peaks.
+
+    Parameters
+    ----------
+    event_labels : array-like
+        Event labels; forwarded unchanged to ``extract_task_presence``.
+    TR_task : float
+        Repetition time (seconds); passed as both ``TR_task`` and ``TR_mri``.
+    max_lag : int or None
+        Maximum lag to compute autocorrelation. If None, uses len(x)//2.
+
+    Returns
+    -------
+    dict
+        "periodicity": autocorrelation value at the highest local peak of
+        ``r``, or 0.0 when ``find_peaks`` reports no peak;
+        "best_lag": lag (in samples) of that peak, or None when there is none;
+        "r": autocorrelation values from lag 0..max_lag.
+        NOTE: the result is a dict, not a (periodicity, best_lag, r) tuple.
+    """
+    x, _ = extract_task_presence(
+        event_labels=event_labels,
+        TR_task=TR_task,
+        TR_mri=TR_task,
+        TR_array=None,
+        binary=False,
+        binarizing_method="GMM",
+        no_hrf=False,
+    )
+
+    # Center the time course around zero before computing the autocorrelation
+    x = x - x.mean()
+
+    if max_lag is None:
+        max_lag = len(x) // 2
+
+    # r[0] = 1 by definition
+    r = acf(x, nlags=max_lag, fft=False)
+
+    # Local maxima of the autocorrelation are the candidate periods
+    peaks, _ = find_peaks(r)
+
+    if len(peaks) == 0:
+        return {"periodicity": 0.0, "best_lag": None, "r": r}
+
+    # Keep only peaks at a non-zero lag
+    peaks = peaks[peaks > 0]
+
+    if len(peaks) == 0:
+        return {"periodicity": 0.0, "best_lag": None, "r": r}
+
+    best_lag = peaks[np.argmax(r[peaks])]
+
+    # Earlier variant (disabled): use the global maximum over lags >= 1
+    # instead of restricting the search to local peaks, i.e.
+    #     best_lag = np.argmax(r[1:]) + 1
+    periodicity = r[best_lag]
+
+    return {
+        "periodicity": periodicity,
+        "best_lag": best_lag,
+        "r": r,
+    }
diff --git a/task_dFC/multi_dataset_analysis/performance_predict.py b/task_dFC/multi_dataset_analysis/performance_predict.py
index 5ebf36d..730fc32 100644
--- a/task_dFC/multi_dataset_analysis/performance_predict.py
+++ b/task_dFC/multi_dataset_analysis/performance_predict.py
@@ -84,6 +84,7 @@
     task_durations_all = {}
     PI_all = {}
     OI_all = {}
+    PAC_all = {}
     for dataset in DATASETS:
 
         print(f"Processing dataset: {dataset}")
@@ -169,6 +170,13 @@
                         )
                         OI = out["OI_norm"]
 
+                        # Periodicity via autocorrelation
+                        out = compute_periodicity_index(
+                            event_labels=event_labels,
+                            TR_task=1 / task_data["Fs_task"],
+                        )
+                        PAC = out["periodicity"]
+
                         if not task in task_ratio_all:
                             task_ratio_all[task] = []
                         if not task in transition_freq_all:
@@ -181,6 +189,8 @@
                             PI_all[task] = []
                         if not task in OI_all:
                             OI_all[task] = []
+                        if not task in PAC_all:
+                            PAC_all[task] = []
                         task_ratio_all[task].append(relative_task_on)
                         transition_freq_all[task].append(relative_transition_freq)
                         # rest_durations and task_durations are lists
@@ -188,6 +198,7 @@
                         task_durations_all[task].extend(task_durations)
                         PI_all[task].append(PI)
                         OI_all[task].append(OI)
+                        PAC_all[task].append(PAC)
 
     task_design_features = {
         "task_ratio_all": task_ratio_all,
@@ -196,6 +207,7 @@
         "task_durations_all": task_durations_all,
         "PI_all": PI_all,
         "OI_all": OI_all,
+        "PAC_all": PAC_all,
     }
 
     CohensD_across_task = {}
@@ -447,6 +459,7 @@
         task_durations_std = np.std(task_design_features["task_durations_all"][task])
         PI_mean = np.mean(PI_all[task])
         OI_mean = np.mean(OI_all[task])
+        PAC_mean = np.mean(PAC_all[task])
         cohen_d_max = np.max(np.abs(CohensD_across_task[task]))
 
         DATA["task"].append(task)
@@ -458,6 +471,7 @@
         DATA["task_durations_std"].append(task_durations_std)
         DATA["PI_mean"].append(PI_mean)
         DATA["OI_mean"].append(OI_mean)
+        DATA["PAC_mean"].append(PAC_mean)
         DATA["cohen_d_max"].append(cohen_d_max)
 
         # Also add ML scores

From aa85e13175ae6cd690e5b1c92afc379b2a706954 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 27 Nov 2025 00:55:19 -0500
Subject: [PATCH 300/401] bug fix

---
 task_dFC/multi_dataset_analysis/performance_predict.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_predict.py b/task_dFC/multi_dataset_analysis/performance_predict.py
index 730fc32..dbedde1 100644
--- a/task_dFC/multi_dataset_analysis/performance_predict.py
+++ b/task_dFC/multi_dataset_analysis/performance_predict.py
@@ -20,6 +20,7 @@
     compute_optimality_index,
     compute_periodicity_index,
     extract_task_presence,
+    periodicity_autocorr,
 )
 
 fig_bbox_inches = "tight"
@@ -171,7 +172,7 @@
                         OI = out["OI_norm"]
 
                         # Periodicity via autocorrelation
-                        out = compute_periodicity_index(
+                        out = periodicity_autocorr(
                             event_labels=event_labels,
                             TR_task=1 / task_data["Fs_task"],
                         )

From dc72d9c8c3affe20664b7114fff40411a94cbbfc Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 27 Nov 2025 08:36:51 -0500
Subject: [PATCH 301/401] minor

---
 task_dFC/multi_dataset_analysis/performance_predict.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/task_dFC/multi_dataset_analysis/performance_predict.py b/task_dFC/multi_dataset_analysis/performance_predict.py
index dbedde1..6db4136 100644
--- a/task_dFC/multi_dataset_analysis/performance_predict.py
+++ b/task_dFC/multi_dataset_analysis/performance_predict.py
@@ -449,6 +449,7 @@
         "task_durations_std": [],
         "PI_mean": [],
         "OI_mean": [],
+        "PAC_mean": [],
         "cohen_d_max": [],
     }
     for task in TASKS_to_include:

From eae402a7e818bb7befe8663e4d94e3de01c8cb09 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 27 Nov 2025 15:31:11 -0500
Subject: [PATCH 302/401] add rest and task durations original

---
 .../performance_predict.py                    | 43 +++++++++++++++++--
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_predict.py b/task_dFC/multi_dataset_analysis/performance_predict.py
index 6db4136..36ea679 100644
--- a/task_dFC/multi_dataset_analysis/performance_predict.py
+++ b/task_dFC/multi_dataset_analysis/performance_predict.py
@@ -81,6 +81,8 @@
 
     task_ratio_all = {}
     transition_freq_all = {}
+    rest_durations_original_all = {}
+    task_durations_original_all = {}
     rest_durations_all = {}
     task_durations_all = {}
     PI_all = {}
@@ -147,14 +149,21 @@
                         num_of_transitions, relative_transition_freq = (
                             calc_transition_freq(task_presence[indices])
                         )
-                        # calculate rest and task durations based original event labels
+                        # calculate rest and task durations based on original event labels
                         event_labels = np.multiply(task_data["event_labels"] != 0, 1)
-                        rest_durations = calc_rest_duration(
+                        rest_durations_original = calc_rest_duration(
                             event_labels, TR_mri=1 / task_data["Fs_task"]
                         )
-                        task_durations = calc_task_duration(
+                        task_durations_original = calc_task_duration(
                             event_labels, TR_mri=1 / task_data["Fs_task"]
                         )
+                        # calculate rest and task durations based on binary task presence
+                        rest_durations = calc_rest_duration(
+                            task_presence[indices], TR_mri=task_data["TR_mri"]
+                        )
+                        task_durations = calc_task_duration(
+                            task_presence[indices], TR_mri=task_data["TR_mri"]
+                        )
                         # Periodicity Index (low entropy => high periodicity)
                         out = compute_periodicity_index(
                             event_labels=event_labels,
@@ -182,6 +191,10 @@
                             task_ratio_all[task] = []
                         if not task in transition_freq_all:
                             transition_freq_all[task] = []
+                        if not task in rest_durations_original_all:
+                            rest_durations_original_all[task] = []
+                        if not task in task_durations_original_all:
+                            task_durations_original_all[task] = []
                         if not task in rest_durations_all:
                             rest_durations_all[task] = []
                         if not task in task_durations_all:
@@ -195,6 +208,8 @@
                         task_ratio_all[task].append(relative_task_on)
                         transition_freq_all[task].append(relative_transition_freq)
                         # rest_durations and task_durations are lists
+                        rest_durations_original_all[task].extend(rest_durations_original)
+                        task_durations_original_all[task].extend(task_durations_original)
                         rest_durations_all[task].extend(rest_durations)
                         task_durations_all[task].extend(task_durations)
                         PI_all[task].append(PI)
@@ -204,6 +219,8 @@
     task_design_features = {
         "task_ratio_all": task_ratio_all,
         "transition_freq_all": transition_freq_all,
+        "rest_durations_original_all": rest_durations_original_all,
+        "task_durations_original_all": task_durations_original_all,
         "rest_durations_all": rest_durations_all,
         "task_durations_all": task_durations_all,
         "PI_all": PI_all,
@@ -443,6 +460,10 @@
         "task": [],
         "task_ratio": [],
         "transition_freq": [],
+        "rest_durations_original_mean": [],
+        "task_durations_original_mean": [],
+        "rest_durations_original_std": [],
+        "task_durations_original_std": [],
         "rest_durations_mean": [],
         "task_durations_mean": [],
         "rest_durations_std": [],
@@ -455,6 +476,18 @@
     for task in TASKS_to_include:
         task_ratio = np.mean(task_design_features["task_ratio_all"][task])
         transition_freq = np.mean(task_design_features["transition_freq_all"][task])
+        rest_durations_original_mean = np.mean(
+            task_design_features["rest_durations_original_all"][task]
+        )
+        task_durations_original_mean = np.mean(
+            task_design_features["task_durations_original_all"][task]
+        )
+        rest_durations_original_std = np.std(
+            task_design_features["rest_durations_original_all"][task]
+        )
+        task_durations_original_std = np.std(
+            task_design_features["task_durations_original_all"][task]
+        )
         rest_durations_mean = np.mean(task_design_features["rest_durations_all"][task])
         task_durations_mean = np.mean(task_design_features["task_durations_all"][task])
         rest_durations_std = np.std(task_design_features["rest_durations_all"][task])
@@ -467,6 +500,10 @@
         DATA["task"].append(task)
         DATA["task_ratio"].append(task_ratio)
         DATA["transition_freq"].append(transition_freq)
+        DATA["rest_durations_original_mean"].append(rest_durations_original_mean)
+        DATA["task_durations_original_mean"].append(task_durations_original_mean)
+        DATA["rest_durations_original_std"].append(rest_durations_original_std)
+        DATA["task_durations_original_std"].append(task_durations_original_std)
         DATA["rest_durations_mean"].append(rest_durations_mean)
         DATA["task_durations_mean"].append(task_durations_mean)
         DATA["rest_durations_std"].append(rest_durations_std)

From 3dbb63b81ca5e538aca2b7b3b2dc7a57f46def32 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 28 Nov 2025 13:39:16 -0500
Subject: [PATCH 303/401] minor

---
 .../sample_matrix_visualization.py              | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 3e36243..22fb387 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -265,6 +265,23 @@
                         save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-label_{task}_{group}{raw_or_embedded}.png",
                         show=False,
                     )
+                    if (
+                        task == "task-localiser"
+                        and group == "train"
+                        and raw_or_embedded == ""
+                        and simul_or_real == "real"
+                        and (
+                            measure_name == "SlidingWindow" or measure_name == "Time-Freq"
+                        )
+                    ):
+                        plot_samples_features(
+                            X,
+                            y,
+                            sample_order="label",
+                            feature_order="original",
+                            save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-label_{task}_{group}{raw_or_embedded}.svg",
+                            show=False,
+                        )
 
                     # C) Label + within-class clustering + t-stat top bar
                     if group == "train":

From 506305039987473fa9813d2b69bccaa092a59414 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 1 Dec 2025 17:19:25 -0500
Subject: [PATCH 304/401] add pattern distinctiveness

---
 .../helper_functions.py                       | 48 +++++++++++++++++++
 .../sample_matrix_visualization.py            | 32 +++++++++++++
 2 files changed, 80 insertions(+)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 6ff8874..995af8e 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -14,6 +14,7 @@
 from matplotlib.colors import ListedColormap
 from scipy.cluster.hierarchy import leaves_list, linkage
 from scipy.stats import ttest_ind
+from sklearn.neighbors import NearestNeighbors
 
 ###################### Publication style ######################
 
@@ -1201,3 +1202,50 @@ def save_scalar_colorbar(
 
     fig.savefig(filename, bbox_inches="tight", pad_inches=0.02)
     plt.close(fig)
+
+
+def nearest_neighbor_match(X, y):
+    """Compute the fraction of samples whose nearest neighbor (by correlation) has the same class label."""
+    label_match = []
+    for i, sample in enumerate(X):
+        class_label = y[i]
+        # find the nearest sample using nearest neighbor
+        nbrs = NearestNeighbors(
+            n_neighbors=2, algorithm="auto", metric="correlation"
+        ).fit(X)
+        indices = nbrs.kneighbors(sample.reshape(1, -1), return_distance=False)
+        nearest_index = indices[0][1]  # index 0 is the sample itself
+        # find the label of the nearest sample
+        nearest_label = y[nearest_index]
+        label_match.append(class_label == nearest_label)
+    return np.mean(label_match)
+
+
+def other_class_max_corr(X, y):
+    """Compute, for each sample, the maximum correlation with samples from the other class.
+    Return summary statistics: median, fraction above 0.9, 95th percentile, and fraction of z-scores > 1.645.
+    """
+    all_corrs = []
+    for i, sample in enumerate(X):
+        class_label = y[i]
+        other_class_label = 1 - class_label
+        # find the correlation of that sample with each of the samples from the other class
+        corrs = [
+            np.corrcoef(sample.flatten(), other_sample.flatten())[0, 1]
+            for j, other_sample in enumerate(X)
+            if y[j] == other_class_label
+        ]
+        all_corrs.append(np.max(corrs))
+
+    all_corrs = np.asarray(all_corrs)
+    median = np.median(all_corrs)
+    above_90 = np.mean(np.array(all_corrs) > 0.9)
+    percentile_95 = np.percentile(all_corrs, 95)
+
+    mean_corr = all_corrs.mean()
+    std_corr = all_corrs.std(ddof=1)  # unbiased std
+
+    z = (all_corrs - mean_corr) / std_corr
+    high_frac = np.mean(z > 1.645) * 100
+
+    return median, above_90, percentile_95, high_frac
diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 22fb387..251e5e9 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -14,6 +14,8 @@
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from helper_functions import (  # pyright: ignore[reportMissingImports]
+    nearest_neighbor_match,
+    other_class_max_corr,
     plot_samples_features,
     save_scalar_colorbar,
 )
@@ -68,6 +70,15 @@
     if not os.path.exists(output_root):
         os.makedirs(output_root)
 
+    pattern_distinctiveness = {
+        "dFC method": [],
+        "task": [],
+        "NN_label_match": [],
+        "other_class_max_corr_median": [],
+        "other_class_max_corr_above_90": [],
+        "other_class_max_corr_95th_percentile": [],
+        "other_class_max_corr_high_frac": [],
+    }
     for dataset in DATASETS:
         dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
         roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
@@ -238,6 +249,21 @@
                     continue
 
                 # np.save(f"{output_root}/processed_data/{dataset}_{task}_{measure_name}.npy", DATA[task])
+                NN_label_match = nearest_neighbor_match(X_train, y_train)
+                median, above_90, percentile_95, high_frac = other_class_max_corr(
+                    X_train, y_train
+                )
+                pattern_distinctiveness["dFC method"].append(measure_name)
+                pattern_distinctiveness["task"].append(task)
+                pattern_distinctiveness["NN_label_match"].append(NN_label_match)
+                pattern_distinctiveness["other_class_max_corr_median"].append(median)
+                pattern_distinctiveness["other_class_max_corr_above_90"].append(above_90)
+                pattern_distinctiveness["other_class_max_corr_95th_percentile"].append(
+                    percentile_95
+                )
+                pattern_distinctiveness["other_class_max_corr_high_frac"].append(
+                    high_frac
+                )
 
                 for group, X, y in zip(
                     ["train", "test"], [X_train, X_test], [y_train, y_test]
@@ -312,3 +338,9 @@
                         label="z-scored feature value",
                         filename=f"{output_root}/zscore_colorbar.png",
                     )
+
+    # Save pattern distinctiveness results
+    np.save(
+        f"{output_root}/pattern_distinctiveness{raw_or_embedded}.npy",
+        pattern_distinctiveness,
+    )

From 5dfa77c79ca0129637dd2c7e2cbd5567120829e9 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 1 Dec 2025 23:16:44 -0500
Subject: [PATCH 305/401] boost other_class_max_corr

---
 .../helper_functions.py                       | 61 +++++++++++++------
 1 file changed, 43 insertions(+), 18 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 995af8e..ae3a19b 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -938,7 +938,7 @@ def plot_samples_features(
     X,
     y,
     *,
-    sample_order="original",  # "original" | "label" | "label+cluster"
+    sample_order="original",  # "original" | "label" | "label+cluster" | "cluster"
     feature_order="original",  # "original" | "tstat"
     col_order_from_train=None,  # optional np.ndarray (feature indices) to reuse on test
     ZSCORE=True,
@@ -1007,6 +1007,20 @@ def order_rows(A):
         row_order = np.r_[rest_order, task_order]
         split = len(rest_order)
         draw_separator = True
+    elif sample_order == "cluster":
+
+        def order_rows(A):
+            if len(A) <= 2:
+                return np.arange(len(A))
+            return leaves_list(linkage(A, method="average", metric="cosine"))
+
+        all_idx = np.arange(n_samples)
+        # rest_order = rest_idx[order_rows(Xz[rest_idx])] if len(rest_idx) else rest_idx
+        # task_order = task_idx[order_rows(Xz[task_idx])] if len(task_idx) else task_idx
+
+        row_order = all_idx[order_rows(Xz[all_idx])]
+        split = np.sum(y == 0)
+        draw_separator = False
     else:
         raise ValueError(
             "sample_order must be one of {'original','label','label+cluster'}"
@@ -1222,30 +1236,41 @@ def nearest_neighbor_match(X, y):
 
 
 def other_class_max_corr(X, y):
-    """Compute, for each sample, the maximum correlation with samples from the other class.
-    Return summary statistics: median, fraction above 0.9, 95th percentile, and fraction of z-scores > 1.645.
     """
-    all_corrs = []
-    for i, sample in enumerate(X):
-        class_label = y[i]
-        other_class_label = 1 - class_label
-        # find the correlation of that sample with each of the samples from the other class
-        corrs = [
-            np.corrcoef(sample.flatten(), other_sample.flatten())[0, 1]
-            for j, other_sample in enumerate(X)
-            if y[j] == other_class_label
-        ]
-        all_corrs.append(np.max(corrs))
+    Fast computation of max cross-class correlation per sample and summary stats.
+    X: array of shape (n_samples, n_features)
+    y: array of 0/1 labels
+    """
+    X = np.asarray(X)
+    y = np.asarray(y)
+
+    # 1) Normalize each sample to zero-mean, unit-norm (required for correlation)
+    X_norm = X - X.mean(axis=1, keepdims=True)
+    X_norm /= np.linalg.norm(X_norm, axis=1, keepdims=True) + 1e-12  # avoid div-by-zero
+
+    # 2) Compute full correlation matrix using dot product
+    #    correlation = (x_i ⋅ x_j) after normalization
+    corr_matrix = X_norm @ X_norm.T  # shape (N, N)
+
+    # 3) For each sample, extract correlations to the other class only
+    y = y.astype(int)
+    other_mask = y[:, None] != y[None, :]  # shape (N, N): True where classes differ
+
+    # Mask out same-class or self-corrs
+    cross_corrs = np.where(other_mask, corr_matrix, -np.inf)
+
+    # 4) Max correlation for each sample with the other class
+    all_corrs = np.max(cross_corrs, axis=1)
 
-    all_corrs = np.asarray(all_corrs)
+    # ---- Summary statistics ----
     median = np.median(all_corrs)
-    above_90 = np.mean(np.array(all_corrs) > 0.9)
+    above_90 = np.mean(all_corrs > 0.9)
     percentile_95 = np.percentile(all_corrs, 95)
 
     mean_corr = all_corrs.mean()
-    std_corr = all_corrs.std(ddof=1)  # unbiased std
+    std_corr = all_corrs.std(ddof=1)
 
-    z = (all_corrs - mean_corr) / std_corr
+    z = (all_corrs - mean_corr) / (std_corr + 1e-12)
     high_frac = np.mean(z > 1.645) * 100
 
     return median, above_90, percentile_95, high_frac

From 34619f40013e9709dfbdb72de03de15abf13a1d3 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 2 Dec 2025 20:10:09 -0500
Subject: [PATCH 306/401] improve other_class_max_corr and
 nearest_neighbor_match

---
 .../helper_functions.py                       | 110 ++++++++++++------
 .../sample_matrix_visualization.py            |  26 ++++-
 2 files changed, 98 insertions(+), 38 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index ae3a19b..1cbefbe 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -1218,49 +1218,93 @@ def save_scalar_colorbar(
     plt.close(fig)
 
 
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+
 def nearest_neighbor_match(X, y):
-    """Compute the fraction of samples whose nearest neighbor (by correlation) has the same class label."""
-    label_match = []
-    for i, sample in enumerate(X):
-        class_label = y[i]
-        # find the nearest sample using nearest neighbor
-        nbrs = NearestNeighbors(
-            n_neighbors=2, algorithm="auto", metric="correlation"
-        ).fit(X)
-        indices = nbrs.kneighbors(sample.reshape(1, -1), return_distance=False)
-        nearest_index = indices[0][1]  # index 0 is the sample itself
-        # find the label of the nearest sample
-        nearest_label = y[nearest_index]
-        label_match.append(class_label == nearest_label)
-    return np.mean(label_match)
-
-
-def other_class_max_corr(X, y):
     """
-    Fast computation of max cross-class correlation per sample and summary stats.
-    X: array of shape (n_samples, n_features)
-    y: array of 0/1 labels
+    Compute fraction of matching labels for k=1,5,10 nearest neighbors.
+    For k>1, this returns fraction of *all neighbor votes* that match the label.
     """
     X = np.asarray(X)
     y = np.asarray(y)
 
-    # 1) Normalize each sample to zero-mean, unit-norm (required for correlation)
-    X_norm = X - X.mean(axis=1, keepdims=True)
-    X_norm /= np.linalg.norm(X_norm, axis=1, keepdims=True) + 1e-12  # avoid div-by-zero
+    # Fit once with max k
+    max_k = 10
+    nbrs = NearestNeighbors(
+        n_neighbors=max_k + 1,
+        metric="correlation",
+        algorithm="auto",
+    ).fit(X)
+
+    # Compute neighbors for all samples once
+    indices = nbrs.kneighbors(X, return_distance=False)[:, 1:]  # drop self
+
+    # Labels of all neighbors: shape (N, 10)
+    neighbor_labels = y[indices]
+
+    # Expand y to shape (N,1) for vectorized comparison
+    y_col = y.reshape(-1, 1)
 
-    # 2) Compute full correlation matrix using dot product
-    #    correlation = (x_i ⋅ x_j) after normalization
-    corr_matrix = X_norm @ X_norm.T  # shape (N, N)
+    # Boolean match matrix: (N,10)
+    match_matrix = neighbor_labels == y_col
 
-    # 3) For each sample, extract correlations to the other class only
-    y = y.astype(int)
-    other_mask = y[:, None] != y[None, :]  # shape (N, N): True where classes differ
+    # Compute metrics
+    match_1 = np.mean(match_matrix[:, :1])  # first neighbor
+    match_5 = np.mean(match_matrix[:, :5])  # first 5 neighbors
+    match_10 = np.mean(match_matrix[:, :10])  # all 10 neighbors
 
-    # Mask out same-class or self-corrs
-    cross_corrs = np.where(other_mask, corr_matrix, -np.inf)
+    return match_1, match_5, match_10
 
-    # 4) Max correlation for each sample with the other class
-    all_corrs = np.max(cross_corrs, axis=1)
+
+def other_class_max_corr(X, y, method="fast"):
+    """
+    Fast computation of max cross-class correlation per sample and summary stats.
+    X: array of shape (n_samples, n_features)
+    y: array of 0/1 labels
+    method: "slow" (loop-based) or "fast" (matrix-based)
+    use slow when n_samples is large and memory is limited; fast is much quicker for moderate n_samples
+    Returns: median, fraction_above_0.9, 95th_percentile, fraction_z_gt_1.645
+    """
+    X = np.asarray(X)
+    y = np.asarray(y)
+
+    if method == "slow":
+        all_corrs = []
+        for i, sample in enumerate(X):
+            class_label = y[i]
+            other_class_label = 1 - class_label
+            # find the correlation of that sample with each of the samples from the other class
+            corrs = [
+                np.corrcoef(sample.flatten(), other_sample.flatten())[0, 1]
+                for j, other_sample in enumerate(X)
+                if y[j] == other_class_label
+            ]
+            all_corrs.append(np.max(corrs))
+        all_corrs = np.asarray(all_corrs)
+    elif method == "fast":
+        # 1) Normalize each sample to zero-mean, unit-norm (required for correlation)
+        X_norm = X - X.mean(axis=1, keepdims=True)
+        X_norm /= (
+            np.linalg.norm(X_norm, axis=1, keepdims=True) + 1e-12
+        )  # avoid div-by-zero
+
+        # 2) Compute full correlation matrix using dot product
+        #    correlation = (x_i ⋅ x_j) after normalization
+        corr_matrix = X_norm @ X_norm.T  # shape (N, N)
+
+        # 3) For each sample, extract correlations to the other class only
+        y = y.astype(int)
+        other_mask = y[:, None] != y[None, :]  # shape (N, N): True where classes differ
+
+        # Mask out same-class or self-corrs
+        cross_corrs = np.where(other_mask, corr_matrix, -np.inf)
+
+        # 4) Max correlation for each sample with the other class
+        all_corrs = np.max(cross_corrs, axis=1)
+    else:
+        raise ValueError("method must be 'slow' or 'fast'")
 
     # ---- Summary statistics ----
     median = np.median(all_corrs)
diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 251e5e9..d58cb92 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -73,7 +73,9 @@
     pattern_distinctiveness = {
         "dFC method": [],
         "task": [],
-        "NN_label_match": [],
+        "NN1_label_match": [],
+        "NN5_label_match": [],
+        "NN10_label_match": [],
         "other_class_max_corr_median": [],
         "other_class_max_corr_above_90": [],
         "other_class_max_corr_95th_percentile": [],
@@ -249,13 +251,27 @@
                     continue
 
                 # np.save(f"{output_root}/processed_data/{dataset}_{task}_{measure_name}.npy", DATA[task])
-                NN_label_match = nearest_neighbor_match(X_train, y_train)
-                median, above_90, percentile_95, high_frac = other_class_max_corr(
-                    X_train, y_train
+                NN1_label_match, NN5_label_match, NN10_label_match = (
+                    nearest_neighbor_match(X_train, y_train)
                 )
+                if task == "task-paingen":
+                    # due to memory issue, use the slow version for this task
+                    median, above_90, percentile_95, high_frac = other_class_max_corr(
+                        X_train,
+                        y_train,
+                        method="slow",
+                    )
+                else:
+                    median, above_90, percentile_95, high_frac = other_class_max_corr(
+                        X_train,
+                        y_train,
+                        method="fast",
+                    )
                 pattern_distinctiveness["dFC method"].append(measure_name)
                 pattern_distinctiveness["task"].append(task)
-                pattern_distinctiveness["NN_label_match"].append(NN_label_match)
+                pattern_distinctiveness["NN1_label_match"].append(NN1_label_match)
+                pattern_distinctiveness["NN5_label_match"].append(NN5_label_match)
+                pattern_distinctiveness["NN10_label_match"].append(NN10_label_match)
                 pattern_distinctiveness["other_class_max_corr_median"].append(median)
                 pattern_distinctiveness["other_class_max_corr_above_90"].append(above_90)
                 pattern_distinctiveness["other_class_max_corr_95th_percentile"].append(

From 13777370540f44f4ebfcfe32f08a87509030b025 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 2 Dec 2025 20:12:38 -0500
Subject: [PATCH 307/401] minor

---
 task_dFC/multi_dataset_analysis/helper_functions.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 1cbefbe..6e54314 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -1218,10 +1218,6 @@ def save_scalar_colorbar(
     plt.close(fig)
 
 
-import numpy as np
-from sklearn.neighbors import NearestNeighbors
-
-
 def nearest_neighbor_match(X, y):
     """
     Compute fraction of matching labels for k=1,5,10 nearest neighbors.

From be9b9d3c4d2c2314e064c15db668bb5fc6eb772e Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 3 Dec 2025 10:41:02 -0500
Subject: [PATCH 308/401] minor

---
 .../sample_matrix_visualization.py            | 45 +++++++++++--------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index d58cb92..00ec484 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -126,6 +126,10 @@
                             dFC_id=dFC_id,
                         )
 
+                        if task == "task-paingen":
+                            # due to computational load, only use 120 subjects for this task
+                            SUBJECTS = SUBJECTS[:120]
+
                         # randomly select train_test_ratio of the subjects for training
                         # and rest for testing using numpy.random.choice
                         train_subjects = np.random.choice(
@@ -254,19 +258,24 @@
                 NN1_label_match, NN5_label_match, NN10_label_match = (
                     nearest_neighbor_match(X_train, y_train)
                 )
-                if task == "task-paingen":
-                    # due to memory issue, use the slow version for this task
-                    median, above_90, percentile_95, high_frac = other_class_max_corr(
-                        X_train,
-                        y_train,
-                        method="slow",
-                    )
-                else:
-                    median, above_90, percentile_95, high_frac = other_class_max_corr(
-                        X_train,
-                        y_train,
-                        method="fast",
-                    )
+                median, above_90, percentile_95, high_frac = other_class_max_corr(
+                    X_train,
+                    y_train,
+                    method="fast",
+                )
+                # if task == "task-paingen":
+                #     # due to memory issue, use the slow version for this task
+                #     median, above_90, percentile_95, high_frac = other_class_max_corr(
+                #         X_train,
+                #         y_train,
+                #         method="slow",
+                #     )
+                # else:
+                #     median, above_90, percentile_95, high_frac = other_class_max_corr(
+                #         X_train,
+                #         y_train,
+                #         method="fast",
+                #     )
                 pattern_distinctiveness["dFC method"].append(measure_name)
                 pattern_distinctiveness["task"].append(task)
                 pattern_distinctiveness["NN1_label_match"].append(NN1_label_match)
@@ -355,8 +364,8 @@
                         filename=f"{output_root}/zscore_colorbar.png",
                     )
 
-    # Save pattern distinctiveness results
-    np.save(
-        f"{output_root}/pattern_distinctiveness{raw_or_embedded}.npy",
-        pattern_distinctiveness,
-    )
+        # Save pattern distinctiveness results
+        np.save(
+            f"{output_root}/pattern_distinctiveness{raw_or_embedded}.npy",
+            pattern_distinctiveness,
+        )

From f2fcd0cfecd51f0c9a02148f6ea885713b9c9030 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 4 Dec 2025 13:05:01 -0500
Subject: [PATCH 309/401] modify cog domains

---
 .../helper_functions.py                       | 160 +++++++-----------
 1 file changed, 59 insertions(+), 101 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 6e54314..89de251 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -72,105 +72,64 @@ def get_cog_domain_info(simul_or_real: str):
     if simul_or_real == "real":
         # --- Cognitive-Atlas–aligned domains (order on paper) ---
         DOMAIN_ORDER = [
-            "Language",
-            "Numerical cognition",
-            "Cognitive control",
-            "Working memory",
-            "Attention",
-            "Decision-making & valuation",
-            "Emotion & social processes",
-            "Cue reactivity / craving",
-            "Pain / nociception",
-            "Sensorimotor",
-            "Perception & naturalistic memory",
-            "Neurofeedback",
-            "Functional localizers",
-            "Other",
+            "Arousal and Regulatory Systems",
+            "Cognitive Systems",
+            "Negative Valence System",
+            "Positive Valence System",
+            "Sensorimotor Systems",
         ]
 
         # --- Map canonical task codes -> domain ---
         TASK2DOMAIN = {
-            # Language
-            "audrhyme": "Language",
-            "audsem": "Language",
-            "audspell": "Language",
-            "visrhyme": "Language",
-            "vissem": "Language",
-            "visspell": "Language",
-            "speech": "Language",
-            # Numerical
-            "arithmetic": "Numerical cognition",
-            # Cognitive control
-            "stroop": "Cognitive control",
-            "gstroop": "Cognitive control",
-            "cuedts": "Cognitive control",
-            "axcpt": "Cognitive control",
-            "matching": "Cognitive control",
-            # Working memory
-            "stern": "Working memory",
-            "vswm": "Working memory",
-            "workingmemory": "Working memory",
-            # Attention
-            "spatialdetection": "Attention",
-            "oddball": "Attention",
-            # Decision-making & valuation
-            "bart": "Decision-making & valuation",
-            "risk": "Decision-making & valuation",
-            "itc": "Decision-making & valuation",
-            "delaydiscounting": "Decision-making & valuation",
-            "mgt": "Decision-making & valuation",
-            # Emotion & social
-            "emomatching": "Emotion & social processes",
-            "anticipation": "Emotion & social processes",
-            "fearlearning": "Emotion & social processes",
-            "emotionregulation": "Emotion & social processes",
-            "faces": "Emotion & social processes",
-            # Cue reactivity
-            "cic": "Cue reactivity / craving",
-            # Pain
-            "paingen": "Pain / nociception",
+            # Language & Regulatory Systems
+            "emotionregulation": "Arousal & Regulatory Systems",
+            # Cognitive Systems
+            "audsem": "Cognitive Systems",
+            "visrhyme": "Cognitive Systems",
+            "vissem": "Cognitive Systems",
+            "visspell": "Cognitive Systems",
+            "arithmetic": "Cognitive Systems",
+            "stroop": "Cognitive Systems",
+            "cuedts": "Cognitive Systems",
+            "axcpt": "Cognitive Systems",
+            "matching": "Cognitive Systems",
+            "stern": "Cognitive Systems",
+            "st": "Cognitive Systems",
+            "vswm": "Cognitive Systems",
+            "expo": "Cognitive Systems",
+            "recall": "Cognitive Systems",
+            "feedback": "Cognitive Systems",
+            "ppalocalizer": "Cognitive Systems",
+            "localiser": "Cognitive Systems",
+            "localizer": "Cognitive Systems",
+            # Positive Valence System
+            "cic": "Positive Valence System",
+            "fribbids": "Positive Valence System",
+            "risk": "Positive Valence System",
+            "itc": "Positive Valence System",
+            # Negative Valence System
+            "fearlearning": "Negative Valence System",
+            "paingen": "Negative Valence System",
             # Sensorimotor
-            "motor": "Sensorimotor",
-            "execution": "Sensorimotor",
-            "imagery": "Sensorimotor",
-            "ihg": "Sensorimotor",
-            # Perception & naturalistic memory
-            "expo": "Perception & naturalistic memory",
-            "recall": "Perception & naturalistic memory",
-            # Methodological — Neurofeedback
-            "feedback": "Neurofeedback",
-            # Methodological — Functional localizers
-            "ppalocalizer": "Functional localizers",
-            "floc": "Functional localizers",
-            "fribbids": "Functional localizers",
-            "midloc": "Functional localizers",
-            "localiser": "Functional localizers",
-            "localizer": "Functional localizers",
+            "motor": "Sensorimotor Systems",
+            "execution": "Sensorimotor Systems",
+            "imagery": "Sensorimotor Systems",
+            "ihg": "Sensorimotor Systems",
         }
         # base colors per domain (distinct, colorblind-friendly)
         DOMAIN_BASE = {
-            "Language": "#1f77b4",
-            "Numerical cognition": "#ff7f0e",
-            "Cognitive control": "#02833E",
-            "Working memory": "#d62728",
-            "Attention": "#9467bd",
-            "Decision-making & valuation": "#8c564b",
-            "Emotion & social processes": "#e377c2",
-            "Cue reactivity / craving": "#D337D5",
-            "Pain / nociception": "#bcbd22",
-            "Sensorimotor": "#17becf",
-            "Perception & naturalistic memory": "#1f9e89",
-            "Neurofeedback": "#d0e81f",
-            "Functional localizers": "#35cf33",
-            "Other": "#646464",
+            "Arousal & Regulatory Systems": "#1f77b4",
+            "Cognitive Systems": "#ff7f0e",
+            "Positive Valence System": "#02833E",
+            "Negative Valence System": "#d62728",
+            "Sensorimotor Systems": "#9467bd",
         }
     elif simul_or_real == "simulated":
         # --- Categories of simulated task paradigms ---
         DOMAIN_ORDER = [
             "Simulated Periodic",
-            "Optimal Paradigm Design, Strong Performance on Real Data",
-            "Optimal Paradigm Design, Limited Performance on Real Data",
-            "Sub-Optimal Paradigm Design, Limited Performance on Real Data",
+            "Strong Performance on Real Data",
+            "Weak Performance on Real Data",
         ]
         # --- Map task codes -> category ---
         TASK2DOMAIN = {
@@ -179,25 +138,24 @@ def get_cog_domain_info(simul_or_real: str):
             "lowfreqshortrest": "Simulated Periodic",
             "lowfreqshorttask": "Simulated Periodic",
             # Optimal Paradigm Design, Strong Performance on Real Data
-            "axcpt": "Optimal Paradigm Design, Strong Performance on Real Data",
-            "stern": "Optimal Paradigm Design, Strong Performance on Real Data",
-            "cuedts": "Optimal Paradigm Design, Strong Performance on Real Data",
-            # Optimal Paradigm Design, Limited Performance on Real Data
-            "execution": "Optimal Paradigm Design, Limited Performance on Real Data",
-            "imagery": "Optimal Paradigm Design, Limited Performance on Real Data",
-            "localizer": "Optimal Paradigm Design, Limited Performance on Real Data",
-            "ppalocalizer": "Optimal Paradigm Design, Limited Performance on Real Data",
-            # Sub-Optimal Paradigm Design, Limited Performance on Real Data
-            "itc": "Sub-Optimal Paradigm Design, Limited Performance on Real Data",
-            "stroop": "Sub-Optimal Paradigm Design, Limited Performance on Real Data",
-            "risk": "Sub-Optimal Paradigm Design, Limited Performance on Real Data",
+            "axcpt": "Strong Performance on Real Data",
+            "stern": "Strong Performance on Real Data",
+            "cuedts": "Strong Performance on Real Data",
+            # Optimal Paradigm Design, Weak Performance on Real Data
+            "execution": "Weak Performance on Real Data",
+            "imagery": "Weak Performance on Real Data",
+            "localizer": "Weak Performance on Real Data",
+            "ppalocalizer": "Weak Performance on Real Data",
+            # Sub-Optimal Paradigm Design, Weak Performance on Real Data
+            "itc": "Weak Performance on Real Data",
+            "stroop": "Weak Performance on Real Data",
+            "risk": "Weak Performance on Real Data",
         }
         # base colors per domain (distinct, colorblind-friendly)
         DOMAIN_BASE = {
             "Simulated Periodic": "#1f77b4",
-            "Optimal Paradigm Design, Strong Performance on Real Data": "#ff7f0e",
-            "Optimal Paradigm Design, Limited Performance on Real Data": "#02833E",
-            "Sub-Optimal Paradigm Design, Limited Performance on Real Data": "#d62728",
+            "Strong Performance on Real Data": "#02833E",
+            "Weak Performance on Real Data": "#d62728",
         }
     else:
         raise ValueError(f"Invalid simul_or_real: {simul_or_real}")

From 0b64b4e16adb0fb496ebccdd16dd67f7264a73dd Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 4 Dec 2025 13:06:35 -0500
Subject: [PATCH 310/401] minor

---
 .../sample_matrix_visualization.py                  | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 00ec484..dd470f2 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -263,19 +263,6 @@
                     y_train,
                     method="fast",
                 )
-                # if task == "task-paingen":
-                #     # due to memory issue, use the slow version for this task
-                #     median, above_90, percentile_95, high_frac = other_class_max_corr(
-                #         X_train,
-                #         y_train,
-                #         method="slow",
-                #     )
-                # else:
-                #     median, above_90, percentile_95, high_frac = other_class_max_corr(
-                #         X_train,
-                #         y_train,
-                #         method="fast",
-                #     )
                 pattern_distinctiveness["dFC method"].append(measure_name)
                 pattern_distinctiveness["task"].append(task)
                 pattern_distinctiveness["NN1_label_match"].append(NN1_label_match)

From 368c17e6f0f36b2c2a09b41b0bcbe71969184160 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 4 Dec 2025 13:26:43 -0500
Subject: [PATCH 311/401] minor

---
 task_dFC/multi_dataset_analysis/helper_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 89de251..097a11b 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -72,7 +72,7 @@ def get_cog_domain_info(simul_or_real: str):
     if simul_or_real == "real":
         # --- Cognitive-Atlas–aligned domains (order on paper) ---
         DOMAIN_ORDER = [
-            "Arousal and Regulatory Systems",
+            "Arousal & Regulatory Systems",
             "Cognitive Systems",
             "Negative Valence System",
             "Positive Valence System",

From 11993fafd4d326955dec83dddf6837edc176fd07 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 4 Dec 2025 13:31:07 -0500
Subject: [PATCH 312/401] minor

---
 task_dFC/multi_dataset_analysis/helper_functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 097a11b..1be93dc 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -118,11 +118,11 @@ def get_cog_domain_info(simul_or_real: str):
         }
         # base colors per domain (distinct, colorblind-friendly)
         DOMAIN_BASE = {
-            "Arousal & Regulatory Systems": "#1f77b4",
+            "Arousal & Regulatory Systems": "#9467bd",
             "Cognitive Systems": "#ff7f0e",
             "Positive Valence System": "#02833E",
             "Negative Valence System": "#d62728",
-            "Sensorimotor Systems": "#9467bd",
+            "Sensorimotor Systems": "#1f77b4",
         }
     elif simul_or_real == "simulated":
         # --- Categories of simulated task paradigms ---

From f700285c61011d67440cee9212a140e5f23d40bc Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 10 Dec 2025 13:35:17 -0500
Subject: [PATCH 313/401] improve LE embed visual

---
 .../LE_embedding_visualization.py             | 27 ++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py b/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py
index 6a2298a..783c1a6 100644
--- a/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py
+++ b/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py
@@ -2,6 +2,7 @@
 import json
 import os
 
+import matplotlib as mpl
 import matplotlib.pyplot as plt
 import numpy as np
 from sklearn.metrics import silhouette_score
@@ -165,18 +166,42 @@
                             print(X_embedded.shape)
 
                             # plot
+                            # ---- publication style (light touch) ----
+                            mpl.rcParams.update(
+                                {
+                                    "legend.fontsize": 10,
+                                    "axes.linewidth": 0.9,
+                                    "pdf.fonttype": 42,
+                                    "ps.fonttype": 42,  # Type 42 = TrueType fonts in PDF/PS output (SVG is governed by svg.fonttype)
+                                    "savefig.bbox": "tight",
+                                    "savefig.dpi": 300,
+                                    "figure.dpi": 150,
+                                }
+                            )
                             fig = plt.figure(figsize=(7, 7))
                             ax = fig.add_subplot(111, projection="3d")
+
+                            colors = ("#B1B1B1", "#2F5BD3")
+
                             for label in np.unique(y):
                                 ax.scatter(
                                     X_embedded[y == label, 0],
                                     X_embedded[y == label, 1],
                                     X_embedded[y == label, 2],
                                     label=["rest", "task"][label],
-                                    s=20,
+                                    s=50,
+                                    c=[colors[label]],
+                                    edgecolors="#202020",
+                                    linewidths=0.25,
+                                    depthshade=False,
                                 )
                             plt.legend()
 
+                            # remove tick labels
+                            ax.set_xticklabels([])
+                            ax.set_yticklabels([])
+                            ax.set_zticklabels([])
+
                             plt.savefig(
                                 f"{output_root}/LE_embed_{task}_{measure_name}.png",
                                 dpi=fig_dpi,

From 92477c16f10cf8865879f64d999c9e06341b4ba9 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 10 Dec 2025 17:22:52 -0500
Subject: [PATCH 314/401] minor

---
 .../multi_dataset_analysis/sample_matrix_visualization.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index dd470f2..294f307 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -130,6 +130,14 @@
                             # due to computational load, only use 120 subjects for this task
                             SUBJECTS = SUBJECTS[:120]
 
+                        if (
+                            task == "task-lowFreqLongRest"
+                            or task == "task-lowFreqShortRest"
+                            or task == "task-lowFreqShortTask"
+                        ):
+                            # due to computational load, only use 100 subjects for these tasks
+                            SUBJECTS = SUBJECTS[:100]
+
                         # randomly select train_test_ratio of the subjects for training
                         # and rest for testing using numpy.random.choice
                         train_subjects = np.random.choice(

From b23b334ea141ce53bcdd317f1964665642d0c4e4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 10 Dec 2025 20:26:57 -0500
Subject: [PATCH 315/401] use cosine for other_class_max_corr

---
 .../helper_functions.py                       | 95 +++++++++++++------
 .../sample_matrix_visualization.py            |  4 +-
 2 files changed, 67 insertions(+), 32 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 1be93dc..0799bce 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -1212,62 +1212,97 @@ def nearest_neighbor_match(X, y):
     return match_1, match_5, match_10
 
 
-def other_class_max_corr(X, y, method="fast"):
+def other_class_max_corr(X, y, method="fast", metric="cosine"):
     """
-    Fast computation of max cross-class correlation per sample and summary stats.
-    X: array of shape (n_samples, n_features)
-    y: array of 0/1 labels
-    method: "slow" (loop-based) or "fast" (matrix-based)
-    use slow when n_samples is large and memory is limited; fast is much quicker for moderate n_samples
-    Returns: median, fraction_above_0.9, 95th_percentile, fraction_z_gt_1.645
+    Compute max cross-class similarity (Pearson or cosine) per sample and summary stats.
+
+    Parameters
+    ----------
+    X : array, shape (n_samples, n_features)
+    y : array, shape (n_samples,)
+    method : "slow" or "fast"
+        slow  = loop-based (OOM-safe but slow)
+        fast  = matrix-based (very fast but uses O(N^2) memory)
+    metric : "pearson" or "cosine"
+        pearson = correlation after mean-centering
+        cosine  = correlation without mean-centering
+
+    Returns
+    -------
+    median, fraction_above_0.9, 95th_percentile, fraction_z_gt_1.645
     """
-    X = np.asarray(X)
+    X = np.asarray(X, dtype=float)
     y = np.asarray(y)
 
+    # ======================================================
+    # Helper: compute normalized X depending on metric
+    # ======================================================
+    def normalize(X):
+        if metric == "pearson":
+            Xn = X - X.mean(axis=1, keepdims=True)
+        elif metric == "cosine":
+            Xn = X.copy()
+        else:
+            raise ValueError("metric must be 'pearson' or 'cosine'")
+
+        norms = np.linalg.norm(Xn, axis=1, keepdims=True) + 1e-12
+        return Xn / norms
+
+    # ======================================================
+    # SLOW METHOD (loop-based, safe for large N)
+    # ======================================================
     if method == "slow":
         all_corrs = []
         for i, sample in enumerate(X):
             class_label = y[i]
             other_class_label = 1 - class_label
-            # find the correlation of that sample with each of the samples from the other class
-            corrs = [
-                np.corrcoef(sample.flatten(), other_sample.flatten())[0, 1]
-                for j, other_sample in enumerate(X)
-                if y[j] == other_class_label
-            ]
+
+            # extract opposite-class samples
+            X_other = X[y == other_class_label]
+
+            # normalize sample and opposite-class samples based on metric
+            s = sample.reshape(1, -1)
+            s_norm = normalize(s)[0]
+            X_other_norm = normalize(X_other)
+
+            # dot products = cosine or Pearson similarity
+            corrs = X_other_norm @ s_norm
+
             all_corrs.append(np.max(corrs))
+
         all_corrs = np.asarray(all_corrs)
+
+    # ======================================================
+    # FAST METHOD (matrix multiplication)
+    # ======================================================
     elif method == "fast":
-        # 1) Normalize each sample to zero-mean, unit-norm (required for correlation)
-        X_norm = X - X.mean(axis=1, keepdims=True)
-        X_norm /= (
-            np.linalg.norm(X_norm, axis=1, keepdims=True) + 1e-12
-        )  # avoid div-by-zero
+        X_norm = normalize(X)  # normalize entire matrix according to metric
 
-        # 2) Compute full correlation matrix using dot product
-        #    correlation = (x_i ⋅ x_j) after normalization
-        corr_matrix = X_norm @ X_norm.T  # shape (N, N)
+        # similarity matrix
+        sim_matrix = X_norm @ X_norm.T  # (N × N)
 
-        # 3) For each sample, extract correlations to the other class only
+        # mask for opposite-class pairs
         y = y.astype(int)
-        other_mask = y[:, None] != y[None, :]  # shape (N, N): True where classes differ
+        other_mask = y[:, None] != y[None, :]
+
+        # mask same-class and diagonal
+        cross_sims = np.where(other_mask, sim_matrix, -np.inf)
 
-        # Mask out same-class or self-corrs
-        cross_corrs = np.where(other_mask, corr_matrix, -np.inf)
+        # max similarity to opposite class
+        all_corrs = np.max(cross_sims, axis=1)
 
-        # 4) Max correlation for each sample with the other class
-        all_corrs = np.max(cross_corrs, axis=1)
     else:
         raise ValueError("method must be 'slow' or 'fast'")
 
-    # ---- Summary statistics ----
+    # ======================================================
+    # Summary statistics
+    # ======================================================
     median = np.median(all_corrs)
     above_90 = np.mean(all_corrs > 0.9)
     percentile_95 = np.percentile(all_corrs, 95)
 
     mean_corr = all_corrs.mean()
     std_corr = all_corrs.std(ddof=1)
-
     z = (all_corrs - mean_corr) / (std_corr + 1e-12)
     high_frac = np.mean(z > 1.645) * 100
 
diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 294f307..0fb7520 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -127,8 +127,8 @@
                         )
 
                         if task == "task-paingen":
-                            # due to computational load, only use 120 subjects for this task
-                            SUBJECTS = SUBJECTS[:120]
+                            # due to computational load, only use 100 subjects for this task
+                            SUBJECTS = SUBJECTS[:100]
 
                         if (
                             task == "task-lowFreqLongRest"

From 8e912b7afb012b6b0e2c3035054ac36afd5925b1 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 11 Dec 2025 12:06:14 -0500
Subject: [PATCH 316/401] add ldc_crossvalidated for sample specificity

---
 .../helper_functions.py                       | 296 +++++++++++++++---
 .../sample_matrix_visualization.py            |  30 +-
 2 files changed, 259 insertions(+), 67 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 0799bce..e3c5239 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -1212,98 +1212,290 @@ def nearest_neighbor_match(X, y):
     return match_1, match_5, match_10
 
 
-def other_class_max_corr(X, y, method="fast", metric="cosine"):
+# def other_class_max_corr(X, y, method="fast", metric="cosine"):
+#     """
+#     Compute max cross-class similarity (Pearson or cosine) per sample and summary stats.
+
+#     Parameters
+#     ----------
+#     X : array, shape (n_samples, n_features)
+#     y : array, shape (n_samples,)
+#     method : "slow" or "fast"
+#         slow  = loop-based (OOM-safe but slow)
+#         fast  = matrix-based (very fast but uses O(N^2) memory)
+#     metric : "pearson" or "cosine"
+#         pearson = correlation after mean-centering
+#         cosine  = correlation without mean-centering
+
+#     Returns
+#     -------
+#     median, fraction_above_0.9, 95th_percentile, fraction_z_gt_1.645
+#     """
+#     X = np.asarray(X, dtype=float)
+#     y = np.asarray(y)
+
+#     # ======================================================
+#     # Helper: compute normalized X depending on metric
+#     # ======================================================
+#     def normalize(X):
+#         if metric == "pearson":
+#             Xn = X - X.mean(axis=1, keepdims=True)
+#         elif metric == "cosine":
+#             Xn = X.copy()
+#         else:
+#             raise ValueError("metric must be 'pearson' or 'cosine'")
+
+#         norms = np.linalg.norm(Xn, axis=1, keepdims=True) + 1e-12
+#         return Xn / norms
+
+#     # ======================================================
+#     # SLOW METHOD (loop-based, safe for large N)
+#     # ======================================================
+#     if method == "slow":
+#         all_corrs = []
+#         for i, sample in enumerate(X):
+#             class_label = y[i]
+#             other_class_label = 1 - class_label
+
+#             # extract opposite-class samples
+#             X_other = X[y == other_class_label]
+
+#             # normalize sample and opposite-class samples based on metric
+#             s = sample.reshape(1, -1)
+#             s_norm = normalize(s)[0]
+#             X_other_norm = normalize(X_other)
+
+#             # dot products = cosine or Pearson similarity
+#             corrs = X_other_norm @ s_norm
+
+#             all_corrs.append(np.max(corrs))
+
+#         all_corrs = np.asarray(all_corrs)
+
+#     # ======================================================
+#     # FAST METHOD (matrix multiplication)
+#     # ======================================================
+#     elif method == "fast":
+#         X_norm = normalize(X)  # normalize entire matrix according to metric
+
+#         # similarity matrix
+#         sim_matrix = X_norm @ X_norm.T  # (N × N)
+
+#         # mask for opposite-class pairs
+#         y = y.astype(int)
+#         other_mask = y[:, None] != y[None, :]
+
+#         # mask same-class and diagonal
+#         cross_sims = np.where(other_mask, sim_matrix, -np.inf)
+
+#         # max similarity to opposite class
+#         all_corrs = np.max(cross_sims, axis=1)
+
+#     else:
+#         raise ValueError("method must be 'slow' or 'fast'")
+
+#     # ======================================================
+#     # Summary statistics
+#     # ======================================================
+#     median = np.median(all_corrs)
+#     above_90 = np.mean(all_corrs > 0.9)
+#     percentile_95 = np.percentile(all_corrs, 95)
+
+#     mean_corr = all_corrs.mean()
+#     std_corr = all_corrs.std(ddof=1)
+#     z = (all_corrs - mean_corr) / (std_corr + 1e-12)
+#     high_frac = np.mean(z > 1.645) * 100
+
+#     return median, above_90, percentile_95, high_frac
+
+
+def other_class_corr(X, y, method="fast", metric="cosine"):
     """
-    Compute max cross-class similarity (Pearson or cosine) per sample and summary stats.
+    Compute cross-class similarity (Pearson or cosine) for ALL opposite-class pairs
+    (each pair counted only once) and summarize their distribution.
 
     Parameters
     ----------
     X : array, shape (n_samples, n_features)
     y : array, shape (n_samples,)
     method : "slow" or "fast"
-        slow  = loop-based (OOM-safe but slow)
-        fast  = matrix-based (very fast but uses O(N^2) memory)
+        slow  = loop-based (avoids full N×N matrix, but slower)
+        fast  = matrix-based (computes class0×class1 block)
     metric : "pearson" or "cosine"
-        pearson = correlation after mean-centering
-        cosine  = correlation without mean-centering
+        pearson = correlation after mean-centering each row
+        cosine  = cosine similarity without mean-centering
 
     Returns
     -------
-    median, fraction_above_0.9, 95th_percentile, fraction_z_gt_1.645
+    fraction_above_0.9, fraction_above_0.95, fraction_above_0.99, fraction_above_0.999
+    0.0–1.0 fractions over ALL unique cross-class similarities.
     """
     X = np.asarray(X, dtype=float)
     y = np.asarray(y)
 
-    # ======================================================
-    # Helper: compute normalized X depending on metric
-    # ======================================================
-    def normalize(X):
+    # ------------------------------------------------------
+    # Helper: row-wise normalization
+    # ------------------------------------------------------
+    def normalize_rows(X_):
         if metric == "pearson":
-            Xn = X - X.mean(axis=1, keepdims=True)
+            Xn = X_ - X_.mean(axis=1, keepdims=True)
         elif metric == "cosine":
-            Xn = X.copy()
+            Xn = X_.copy()
         else:
             raise ValueError("metric must be 'pearson' or 'cosine'")
 
         norms = np.linalg.norm(Xn, axis=1, keepdims=True) + 1e-12
         return Xn / norms
 
-    # ======================================================
-    # SLOW METHOD (loop-based, safe for large N)
-    # ======================================================
+    # Binary labels assumed: 0/1 (or two unique values)
+    unique_labels = np.unique(y)
+    if unique_labels.size != 2:
+        raise ValueError("This function assumes exactly two classes.")
+
+    label_a, label_b = unique_labels
+
+    idx_a = np.where(y == label_a)[0]
+    idx_b = np.where(y == label_b)[0]
+
+    # ------------------------------------------------------
+    # SLOW METHOD (loop over one class only)
+    # ------------------------------------------------------
     if method == "slow":
         all_corrs = []
-        for i, sample in enumerate(X):
-            class_label = y[i]
-            other_class_label = 1 - class_label
 
-            # extract opposite-class samples
-            X_other = X[y == other_class_label]
+        X_b = X[idx_b]
+        X_b_norm = normalize_rows(X_b)
 
-            # normalize sample and opposite-class samples based on metric
-            s = sample.reshape(1, -1)
-            s_norm = normalize(s)[0]
-            X_other_norm = normalize(X_other)
+        for i in idx_a:
+            s = X[i].reshape(1, -1)
+            s_norm = normalize_rows(s)[0]
 
-            # dot products = cosine or Pearson similarity
-            corrs = X_other_norm @ s_norm
+            # similarities between this sample and ALL samples in other class
+            corrs = X_b_norm @ s_norm  # shape: (n_b,)
 
-            all_corrs.append(np.max(corrs))
+            all_corrs.extend(corrs.tolist())
 
         all_corrs = np.asarray(all_corrs)
 
-    # ======================================================
-    # FAST METHOD (matrix multiplication)
-    # ======================================================
+    # ------------------------------------------------------
+    # FAST METHOD (block matrix classA × classB)
+    # ------------------------------------------------------
     elif method == "fast":
-        X_norm = normalize(X)  # normalize entire matrix according to metric
+        X_norm = normalize_rows(X)
 
-        # similarity matrix
-        sim_matrix = X_norm @ X_norm.T  # (N × N)
+        Xa = X_norm[idx_a]  # (n_a, F)
+        Xb = X_norm[idx_b]  # (n_b, F)
 
-        # mask for opposite-class pairs
-        y = y.astype(int)
-        other_mask = y[:, None] != y[None, :]
+        # similarities for all unique cross-class pairs (label_a vs label_b)
+        # shape: (n_a, n_b)
+        sim_block = Xa @ Xb.T
 
-        # mask same-class and diagonal
-        cross_sims = np.where(other_mask, sim_matrix, -np.inf)
-
-        # max similarity to opposite class
-        all_corrs = np.max(cross_sims, axis=1)
+        # flatten: each pair counted exactly once
+        all_corrs = sim_block.ravel()
 
     else:
         raise ValueError("method must be 'slow' or 'fast'")
 
-    # ======================================================
-    # Summary statistics
-    # ======================================================
-    median = np.median(all_corrs)
+    # ------------------------------------------------------
+    # Summary statistics over ALL unique cross-class similarities
+    # ------------------------------------------------------
     above_90 = np.mean(all_corrs > 0.9)
-    percentile_95 = np.percentile(all_corrs, 95)
+    above_95 = np.mean(all_corrs > 0.95)
+    above_99 = np.mean(all_corrs > 0.99)
+    above_999 = np.mean(all_corrs > 0.999)
+
+    return above_90, above_95, above_99, above_999
+
+
+from sklearn.covariance import LedoitWolf
+from sklearn.model_selection import StratifiedKFold
+
+
+def ldc_crossvalidated(X, y, n_splits=2, random_state=None):
+    """
+    Cross-validated Mahalanobis distance (LDC) between two classes.
+
+    Parameters
+    ----------
+    X : array-like, shape (n_samples, n_features)
+        Data matrix. Rows are samples, columns are features.
+    y : array-like, shape (n_samples,)
+        Class labels. Must contain exactly TWO unique labels.
+    n_splits : int, default=2
+        Number of CV splits (partitions). 2 is the classic LDC setup.
+    random_state : int or None
+        Seed for reproducible splits.
+
+    Returns
+    -------
+    ldc : float
+        Cross-validated Mahalanobis distance (LDC).
+        Unbiased estimate of squared distance between class means
+        in noise-whitened space, divided by n_features.
+        The per-fold-pair values are averaged internally; only this
+        scalar mean is returned.
+    """
+    X = np.asarray(X, dtype=float)
+    y = np.asarray(y)
+
+    # ----- 1) Check binary labels -----
+    unique_labels = np.unique(y)
+    if unique_labels.size != 2:
+        raise ValueError("ldc_crossvalidated currently assumes exactly two classes.")
+    label_a, label_b = unique_labels
+
+    n_samples, n_features = X.shape
+
+    # ----- 2) Estimate noise covariance (within-class) -----
+    # Subtract class means to get residuals, then estimate covariance on residuals.
+    X_resid = X.copy()
+    for lbl in unique_labels:
+        mask = y == lbl
+        X_resid[mask] -= X_resid[mask].mean(axis=0, keepdims=True)
+
+    # Shrinkage covariance for stability in high-dim / low-n regimes
+    lw = LedoitWolf().fit(X_resid)
+    Sigma = lw.covariance_
+
+    # ----- 3) Compute whitening transform W such that Sigma^{-1} = W^T W -----
+    # Using Cholesky: Sigma = L L^T  =>  Sigma^{-1} = L^{-T} L^{-1}
+    # Choose W = L^{-1}; then W^T W = Sigma^{-1}.
+    L = np.linalg.cholesky(Sigma)
+    # Solve L * M = I for M, then W = M
+    W = np.linalg.inv(L)  # (n_features x n_features)
+
+    # Whitened data
+    Xw = X @ W  # NOTE(review): with W = L^{-1} and samples in rows, whitening would be X @ W.T; confirm
+
+    # ----- 4) Cross-validation splits -----
+    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
+
+    deltas = []  # mean-difference vectors per fold in whitened space
+
+    for train_idx, _ in skf.split(Xw, y):
+        X_fold = Xw[train_idx]
+        y_fold = y[train_idx]
+
+        # class means in this fold
+        mu_a = X_fold[y_fold == label_a].mean(axis=0)
+        mu_b = X_fold[y_fold == label_b].mean(axis=0)
+
+        delta = mu_a - mu_b  # difference of class means
+        deltas.append(delta)
+
+    deltas = np.vstack(deltas)  # shape (n_splits, n_features)
+
+    # ----- 5) Cross-validated Mahalanobis distance (LDC) -----
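+    # For each pair of partitions f != g: LDC_fg = delta_f^T delta_g / n_features,
+    # averaged over all unique pairs. NOTE(review): deltas come from the train
+    # folds, which overlap when n_splits > 2 and so are not independent; confirm.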
+    pairwise_ldcs = []
+    for i in range(len(deltas)):
+        for j in range(i + 1, len(deltas)):
+            ldc_ij = np.dot(deltas[i], deltas[j]) / n_features
+            pairwise_ldcs.append(ldc_ij)
 
-    mean_corr = all_corrs.mean()
-    std_corr = all_corrs.std(ddof=1)
-    z = (all_corrs - mean_corr) / (std_corr + 1e-12)
-    high_frac = np.mean(z > 1.645) * 100
+    pairwise_ldcs = np.asarray(pairwise_ldcs)
+    ldc = pairwise_ldcs.mean()
 
-    return median, above_90, percentile_95, high_frac
+    return ldc
diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 0fb7520..38d7a79 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -14,8 +14,9 @@
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from helper_functions import (  # pyright: ignore[reportMissingImports]
+    ldc_crossvalidated,
     nearest_neighbor_match,
-    other_class_max_corr,
+    other_class_corr,
     plot_samples_features,
     save_scalar_colorbar,
 )
@@ -76,10 +77,11 @@
         "NN1_label_match": [],
         "NN5_label_match": [],
         "NN10_label_match": [],
-        "other_class_max_corr_median": [],
-        "other_class_max_corr_above_90": [],
-        "other_class_max_corr_95th_percentile": [],
-        "other_class_max_corr_high_frac": [],
+        "other_class_corr_above_90": [],
+        "other_class_corr_above_95": [],
+        "other_class_corr_above_99": [],
+        "other_class_corr_above_999": [],
+        "LDC": [],
     }
     for dataset in DATASETS:
         dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
@@ -266,24 +268,22 @@
                 NN1_label_match, NN5_label_match, NN10_label_match = (
                     nearest_neighbor_match(X_train, y_train)
                 )
-                median, above_90, percentile_95, high_frac = other_class_max_corr(
+                above_90, above_95, above_99, above_999 = other_class_corr(
                     X_train,
                     y_train,
                     method="fast",
                 )
+                ldc = ldc_crossvalidated(X_train, y_train, n_splits=4)
                 pattern_distinctiveness["dFC method"].append(measure_name)
                 pattern_distinctiveness["task"].append(task)
                 pattern_distinctiveness["NN1_label_match"].append(NN1_label_match)
                 pattern_distinctiveness["NN5_label_match"].append(NN5_label_match)
                 pattern_distinctiveness["NN10_label_match"].append(NN10_label_match)
-                pattern_distinctiveness["other_class_max_corr_median"].append(median)
-                pattern_distinctiveness["other_class_max_corr_above_90"].append(above_90)
-                pattern_distinctiveness["other_class_max_corr_95th_percentile"].append(
-                    percentile_95
-                )
-                pattern_distinctiveness["other_class_max_corr_high_frac"].append(
-                    high_frac
-                )
+                pattern_distinctiveness["other_class_corr_above_90"].append(above_90)
+                pattern_distinctiveness["other_class_corr_above_95"].append(above_95)
+                pattern_distinctiveness["other_class_corr_above_99"].append(above_99)
+                pattern_distinctiveness["other_class_corr_above_999"].append(above_999)
+                pattern_distinctiveness["LDC"].append(ldc)
 
                 for group, X, y in zip(
                     ["train", "test"], [X_train, X_test], [y_train, y_test]
@@ -361,6 +361,6 @@
 
         # Save pattern distinctiveness results
         np.save(
-            f"{output_root}/pattern_distinctiveness{raw_or_embedded}.npy",
+            f"{output_root}/pattern_distinctiveness_{simul_or_real}{raw_or_embedded}.npy",
             pattern_distinctiveness,
         )

From 85bce1736365b0f776582697cbfdd9443acf8e68 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 12 Dec 2025 12:12:55 -0500
Subject: [PATCH 317/401] minor

---
 .../helper_functions.py                       |  4 +-
 .../sample_matrix_visualization.py            | 81 ++++++++++---------
 2 files changed, 44 insertions(+), 41 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index e3c5239..fe7b6e6 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -1086,13 +1086,13 @@ def order_rows(A):
     # --- move the class bar (ax_lab) down a bit ---
     fig.canvas.draw()  # ensure positions are current
     lab_box = ax_lab.get_position()  # [x0, y0, width, height] in figure coords
-    down = 0.050  # how much to move down (figure fraction)
+    down = 0.070  # how much to move down (figure fraction)
     new_y0 = max(0.01, lab_box.y0 - down)  # keep it inside the figure
     ax_lab.set_position([lab_box.x0, new_y0, lab_box.width, lab_box.height])
 
     # after you position ax_lab (i.e., after ax_lab.set_position([...]))
     ax_lab.xaxis.set_label_position("top")
-    ax_lab.set_xlabel("sample", labelpad=6, fontweight="bold", fontsize=18)
+    ax_lab.set_xlabel("sample", labelpad=4, fontweight="bold", fontsize=18)
     # keep the strip clean
     ax_lab.tick_params(
         axis="x", which="both", length=0, labelbottom=False, labeltop=False
diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 38d7a79..6419b03 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -14,9 +14,6 @@
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from helper_functions import (  # pyright: ignore[reportMissingImports]
-    ldc_crossvalidated,
-    nearest_neighbor_match,
-    other_class_corr,
     plot_samples_features,
     save_scalar_colorbar,
 )
@@ -71,18 +68,18 @@
     if not os.path.exists(output_root):
         os.makedirs(output_root)
 
-    pattern_distinctiveness = {
-        "dFC method": [],
-        "task": [],
-        "NN1_label_match": [],
-        "NN5_label_match": [],
-        "NN10_label_match": [],
-        "other_class_corr_above_90": [],
-        "other_class_corr_above_95": [],
-        "other_class_corr_above_99": [],
-        "other_class_corr_above_999": [],
-        "LDC": [],
-    }
+    # pattern_distinctiveness = {
+    #     "dFC method": [],
+    #     "task": [],
+    #     "NN1_label_match": [],
+    #     "NN5_label_match": [],
+    #     "NN10_label_match": [],
+    #     "other_class_corr_above_90": [],
+    #     "other_class_corr_above_95": [],
+    #     "other_class_corr_above_99": [],
+    #     "other_class_corr_above_999": [],
+    #     "LDC": [],
+    # }
     for dataset in DATASETS:
         dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
         roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
@@ -264,26 +261,32 @@
                     print(f"Skipping task {task} due to embedding error.")
                     continue
 
-                # np.save(f"{output_root}/processed_data/{dataset}_{task}_{measure_name}.npy", DATA[task])
-                NN1_label_match, NN5_label_match, NN10_label_match = (
-                    nearest_neighbor_match(X_train, y_train)
+                if not os.path.exists(f"{output_root}/processed_data"):
+                    os.makedirs(f"{output_root}/processed_data")
+                np.save(
+                    f"{output_root}/processed_data/{dataset}_{task}_{measure_name}.npy",
+                    DATA[task],
                 )
-                above_90, above_95, above_99, above_999 = other_class_corr(
-                    X_train,
-                    y_train,
-                    method="fast",
-                )
-                ldc = ldc_crossvalidated(X_train, y_train, n_splits=4)
-                pattern_distinctiveness["dFC method"].append(measure_name)
-                pattern_distinctiveness["task"].append(task)
-                pattern_distinctiveness["NN1_label_match"].append(NN1_label_match)
-                pattern_distinctiveness["NN5_label_match"].append(NN5_label_match)
-                pattern_distinctiveness["NN10_label_match"].append(NN10_label_match)
-                pattern_distinctiveness["other_class_corr_above_90"].append(above_90)
-                pattern_distinctiveness["other_class_corr_above_95"].append(above_95)
-                pattern_distinctiveness["other_class_corr_above_99"].append(above_99)
-                pattern_distinctiveness["other_class_corr_above_999"].append(above_999)
-                pattern_distinctiveness["LDC"].append(ldc)
+
+                # NN1_label_match, NN5_label_match, NN10_label_match = (
+                #     nearest_neighbor_match(X_train, y_train)
+                # )
+                # above_90, above_95, above_99, above_999 = other_class_corr(
+                #     X_train,
+                #     y_train,
+                #     method="fast",
+                # )
+                # ldc = ldc_crossvalidated(X_train, y_train, n_splits=4)
+                # pattern_distinctiveness["dFC method"].append(measure_name)
+                # pattern_distinctiveness["task"].append(task)
+                # pattern_distinctiveness["NN1_label_match"].append(NN1_label_match)
+                # pattern_distinctiveness["NN5_label_match"].append(NN5_label_match)
+                # pattern_distinctiveness["NN10_label_match"].append(NN10_label_match)
+                # pattern_distinctiveness["other_class_corr_above_90"].append(above_90)
+                # pattern_distinctiveness["other_class_corr_above_95"].append(above_95)
+                # pattern_distinctiveness["other_class_corr_above_99"].append(above_99)
+                # pattern_distinctiveness["other_class_corr_above_999"].append(above_999)
+                # pattern_distinctiveness["LDC"].append(ldc)
 
                 for group, X, y in zip(
                     ["train", "test"], [X_train, X_test], [y_train, y_test]
@@ -359,8 +362,8 @@
                         filename=f"{output_root}/zscore_colorbar.png",
                     )
 
-        # Save pattern distinctiveness results
-        np.save(
-            f"{output_root}/pattern_distinctiveness_{simul_or_real}{raw_or_embedded}.npy",
-            pattern_distinctiveness,
-        )
+        # # Save pattern distinctiveness results
+        # np.save(
+        #     f"{output_root}/pattern_distinctiveness_{simul_or_real}{raw_or_embedded}.npy",
+        #     pattern_distinctiveness,
+        # )

From c7c9ed165f6d577f22ab41b5e3584e535a6f8fd5 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 15 Dec 2025 11:20:43 -0500
Subject: [PATCH 318/401] minor

---
 task_dFC/FCS_estimate.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index 15d9dff..21623a4 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -59,6 +59,10 @@ def run_FCS_estimate(
         session=session,
         run=run,
     )
+
+    if BOLD is None:
+        print(f"No BOLD data found for task: {task}, session: {session}, run: {run}.")
+        return
     ################################ Measures of dFC #################################
 
     MEASURES_lst, hyper_param_info = multi_analysis_utils.measures_initializer(

From 2151ff9cd489887bd957b684f72a8e744166bd66 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 16 Dec 2025 13:15:29 -0500
Subject: [PATCH 319/401] minor

---
 .../multi_dataset_analysis/sample_matrix_visualization.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 6419b03..a177adf 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -332,7 +332,7 @@
                             show=False,
                         )
 
-                    # C) Label + within-class clustering + t-stat top bar
+                    # C) Label + within-class clustering
                     if group == "train":
                         orders = plot_samples_features(
                             X,
@@ -348,7 +348,7 @@
                             X,
                             y,
                             sample_order="label+cluster",  # clustering is per-split; that’s fine
-                            feature_order="original",  # we still show the t-bar for reference
+                            feature_order="original",
                             col_order_from_train=orders["col_order"],
                             save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
                             show=False,

From 25276e7fb1b92f4d6bf6ba0b71ae3ff229c5007f Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 16 Dec 2025 15:42:41 -0500
Subject: [PATCH 320/401] make FCS estimate parallel

---
 task_dFC/FCS_estimate.py                       | 6 +++---
 task_dFC/run_scripts_slurm/methods_config.json | 2 +-
 task_dFC/run_scripts_slurm/run_FCS.sh          | 1 +
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index 21623a4..9ca0ee8 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -11,9 +11,9 @@
 
 warnings.simplefilter("ignore")
 
-os.environ["MKL_NUM_THREADS"] = "16"
-os.environ["NUMEXPR_NUM_THREADS"] = "16"
-os.environ["OMP_NUM_THREADS"] = "16"
+os.environ["MKL_NUM_THREADS"] = "1"
+os.environ["NUMEXPR_NUM_THREADS"] = "1"
+os.environ["OMP_NUM_THREADS"] = "1"
 
 ########################################################################################
 
diff --git a/task_dFC/run_scripts_slurm/methods_config.json b/task_dFC/run_scripts_slurm/methods_config.json
index 056646f..07ff8ae 100644
--- a/task_dFC/run_scripts_slurm/methods_config.json
+++ b/task_dFC/run_scripts_slurm/methods_config.json
@@ -11,7 +11,7 @@
         "dhmm_obs_state_ratio": 0.666,
         "n_states": 5,
         "n_subj_clstrs": 10,
-        "n_jobs": 2,
+        "n_jobs": 8,
         "verbose": 0,
         "backend": "loky",
         "normalization": true,
diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh
index 11c5422..4195b49 100644
--- a/task_dFC/run_scripts_slurm/run_FCS.sh
+++ b/task_dFC/run_scripts_slurm/run_FCS.sh
@@ -4,6 +4,7 @@
 #SBATCH --output=logs/fcs_out.txt  # Standard output log
 #SBATCH --error=logs/fcs_err.txt   # Standard error log
 #SBATCH --time=7-00:00:00                # Walltime for each task (7 days)
+#SBATCH --cpus-per-task=8  # Number of CPU cores per task
 #SBATCH --mem=64G                     # Memory request per node
 
 DATASET_INFO="./dataset_info.json"

From b38c74301bc3740ea0526a01c4dcd9210420f741 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 17 Dec 2025 00:05:24 -0500
Subject: [PATCH 321/401] minor fix

---
 task_dFC/FCS_estimate.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index 9ca0ee8..99e9073 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -74,8 +74,10 @@ def run_FCS_estimate(
     # to differentiate between the measures
     if len(MEASURES_lst) == 1:
         only_one_measure = True
+        n_jobs = None
     else:
         only_one_measure = False
+        n_jobs = params_multi_analysis["n_jobs"]
 
     if not only_one_measure:
         # we assume only one hyperparameter is altered
@@ -95,7 +97,7 @@ def run_FCS_estimate(
     MEASURES_fit_lst = multi_analysis_utils.estimate_group_FCS(
         time_series=BOLD,
         MEASURES_lst=MEASURES_lst,
-        n_jobs=params_multi_analysis["n_jobs"],
+        n_jobs=n_jobs,
         verbose=params_multi_analysis["verbose"],
         backend=params_multi_analysis["backend"],
     )

From 8dde43604b11273ac68c6bb7f1cb203fb117242b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 19 Dec 2025 12:07:02 -0500
Subject: [PATCH 322/401] minor

---
 task_dFC/FCS_estimate.py                       | 4 ----
 task_dFC/run_scripts_slurm/methods_config.json | 2 +-
 task_dFC/run_scripts_slurm/run_FCS.sh          | 5 +++++
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
index 99e9073..17a45eb 100644
--- a/task_dFC/FCS_estimate.py
+++ b/task_dFC/FCS_estimate.py
@@ -11,10 +11,6 @@
 
 warnings.simplefilter("ignore")
 
-os.environ["MKL_NUM_THREADS"] = "1"
-os.environ["NUMEXPR_NUM_THREADS"] = "1"
-os.environ["OMP_NUM_THREADS"] = "1"
-
 ########################################################################################
 
 
diff --git a/task_dFC/run_scripts_slurm/methods_config.json b/task_dFC/run_scripts_slurm/methods_config.json
index 07ff8ae..056646f 100644
--- a/task_dFC/run_scripts_slurm/methods_config.json
+++ b/task_dFC/run_scripts_slurm/methods_config.json
@@ -11,7 +11,7 @@
         "dhmm_obs_state_ratio": 0.666,
         "n_states": 5,
         "n_subj_clstrs": 10,
-        "n_jobs": 8,
+        "n_jobs": 2,
         "verbose": 0,
         "backend": "loky",
         "normalization": true,
diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh
index 4195b49..fce086b 100644
--- a/task_dFC/run_scripts_slurm/run_FCS.sh
+++ b/task_dFC/run_scripts_slurm/run_FCS.sh
@@ -10,6 +10,11 @@
 DATASET_INFO="./dataset_info.json"
 METHODS_CONFIG="./methods_config.json"
 
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+export OPENBLAS_NUM_THREADS=1
+export NUMEXPR_NUM_THREADS=1
+
 # Activate  virtual environment
 source "/home/mt00/venvs/pydfc/bin/activate"
 

From 03b7ac3c39327ed182faa7b0c6ab0e3784186509 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 9 Jan 2026 19:01:30 -0500
Subject: [PATCH 323/401] minor

---
 task_dFC/run_scripts_slurm/methods_config.json | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/methods_config.json b/task_dFC/run_scripts_slurm/methods_config.json
index 056646f..722b4ff 100644
--- a/task_dFC/run_scripts_slurm/methods_config.json
+++ b/task_dFC/run_scripts_slurm/methods_config.json
@@ -11,9 +11,13 @@
         "dhmm_obs_state_ratio": 0.666,
         "n_states": 5,
         "n_subj_clstrs": 10,
-        "n_jobs": 2,
         "verbose": 0,
-        "backend": "loky",
+        "n_jobs_sw": 8,
+        "backend_sw": "threading",
+        "n_jobs_tf": 2,
+        "backend_tf": "loky",
+        "n_jobs_swc": null,
+        "backend_swc": null,
         "normalization": true,
         "num_subj": null,
         "num_time_point": null

From dbaf4b44f0d4ae99370abb2ea5ae6ace675ab821 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 16 Jan 2026 16:33:13 -0500
Subject: [PATCH 324/401] minor cleaning

---
 .../helper_functions.py                       | 325 ------------------
 .../sample_matrix_visualization.py            |  38 --
 2 files changed, 363 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index fe7b6e6..242cc00 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -1174,328 +1174,3 @@ def save_scalar_colorbar(
 
     fig.savefig(filename, bbox_inches="tight", pad_inches=0.02)
     plt.close(fig)
-
-
-def nearest_neighbor_match(X, y):
-    """
-    Compute fraction of matching labels for k=1,5,10 nearest neighbors.
-    For k>1, this returns fraction of *all neighbor votes* that match the label.
-    """
-    X = np.asarray(X)
-    y = np.asarray(y)
-
-    # Fit once with max k
-    max_k = 10
-    nbrs = NearestNeighbors(
-        n_neighbors=max_k + 1,
-        metric="correlation",
-        algorithm="auto",
-    ).fit(X)
-
-    # Compute neighbors for all samples once
-    indices = nbrs.kneighbors(X, return_distance=False)[:, 1:]  # drop self
-
-    # Labels of all neighbors: shape (N, 10)
-    neighbor_labels = y[indices]
-
-    # Expand y to shape (N,1) for vectorized comparison
-    y_col = y.reshape(-1, 1)
-
-    # Boolean match matrix: (N,10)
-    match_matrix = neighbor_labels == y_col
-
-    # Compute metrics
-    match_1 = np.mean(match_matrix[:, :1])  # first neighbor
-    match_5 = np.mean(match_matrix[:, :5])  # first 5 neighbors
-    match_10 = np.mean(match_matrix[:, :10])  # all 10 neighbors
-
-    return match_1, match_5, match_10
-
-
-# def other_class_max_corr(X, y, method="fast", metric="cosine"):
-#     """
-#     Compute max cross-class similarity (Pearson or cosine) per sample and summary stats.
-
-#     Parameters
-#     ----------
-#     X : array, shape (n_samples, n_features)
-#     y : array, shape (n_samples,)
-#     method : "slow" or "fast"
-#         slow  = loop-based (OOM-safe but slow)
-#         fast  = matrix-based (very fast but uses O(N^2) memory)
-#     metric : "pearson" or "cosine"
-#         pearson = correlation after mean-centering
-#         cosine  = correlation without mean-centering
-
-#     Returns
-#     -------
-#     median, fraction_above_0.9, 95th_percentile, fraction_z_gt_1.645
-#     """
-#     X = np.asarray(X, dtype=float)
-#     y = np.asarray(y)
-
-#     # ======================================================
-#     # Helper: compute normalized X depending on metric
-#     # ======================================================
-#     def normalize(X):
-#         if metric == "pearson":
-#             Xn = X - X.mean(axis=1, keepdims=True)
-#         elif metric == "cosine":
-#             Xn = X.copy()
-#         else:
-#             raise ValueError("metric must be 'pearson' or 'cosine'")
-
-#         norms = np.linalg.norm(Xn, axis=1, keepdims=True) + 1e-12
-#         return Xn / norms
-
-#     # ======================================================
-#     # SLOW METHOD (loop-based, safe for large N)
-#     # ======================================================
-#     if method == "slow":
-#         all_corrs = []
-#         for i, sample in enumerate(X):
-#             class_label = y[i]
-#             other_class_label = 1 - class_label
-
-#             # extract opposite-class samples
-#             X_other = X[y == other_class_label]
-
-#             # normalize sample and opposite-class samples based on metric
-#             s = sample.reshape(1, -1)
-#             s_norm = normalize(s)[0]
-#             X_other_norm = normalize(X_other)
-
-#             # dot products = cosine or Pearson similarity
-#             corrs = X_other_norm @ s_norm
-
-#             all_corrs.append(np.max(corrs))
-
-#         all_corrs = np.asarray(all_corrs)
-
-#     # ======================================================
-#     # FAST METHOD (matrix multiplication)
-#     # ======================================================
-#     elif method == "fast":
-#         X_norm = normalize(X)  # normalize entire matrix according to metric
-
-#         # similarity matrix
-#         sim_matrix = X_norm @ X_norm.T  # (N × N)
-
-#         # mask for opposite-class pairs
-#         y = y.astype(int)
-#         other_mask = y[:, None] != y[None, :]
-
-#         # mask same-class and diagonal
-#         cross_sims = np.where(other_mask, sim_matrix, -np.inf)
-
-#         # max similarity to opposite class
-#         all_corrs = np.max(cross_sims, axis=1)
-
-#     else:
-#         raise ValueError("method must be 'slow' or 'fast'")
-
-#     # ======================================================
-#     # Summary statistics
-#     # ======================================================
-#     median = np.median(all_corrs)
-#     above_90 = np.mean(all_corrs > 0.9)
-#     percentile_95 = np.percentile(all_corrs, 95)
-
-#     mean_corr = all_corrs.mean()
-#     std_corr = all_corrs.std(ddof=1)
-#     z = (all_corrs - mean_corr) / (std_corr + 1e-12)
-#     high_frac = np.mean(z > 1.645) * 100
-
-#     return median, above_90, percentile_95, high_frac
-
-
-def other_class_corr(X, y, method="fast", metric="cosine"):
-    """
-    Compute cross-class similarity (Pearson or cosine) for ALL opposite-class pairs
-    (each pair counted only once) and summarize their distribution.
-
-    Parameters
-    ----------
-    X : array, shape (n_samples, n_features)
-    y : array, shape (n_samples,)
-    method : "slow" or "fast"
-        slow  = loop-based (avoids full N×N matrix, but slower)
-        fast  = matrix-based (computes class0×class1 block)
-    metric : "pearson" or "cosine"
-        pearson = correlation after mean-centering each row
-        cosine  = cosine similarity without mean-centering
-
-    Returns
-    -------
-    fraction_above_0.9, fraction_above_0.95, fraction_above_0.99, fraction_above_0.999
-    0.0–1.0 fractions over ALL unique cross-class similarities.
-    """
-    X = np.asarray(X, dtype=float)
-    y = np.asarray(y)
-
-    # ------------------------------------------------------
-    # Helper: row-wise normalization
-    # ------------------------------------------------------
-    def normalize_rows(X_):
-        if metric == "pearson":
-            Xn = X_ - X_.mean(axis=1, keepdims=True)
-        elif metric == "cosine":
-            Xn = X_.copy()
-        else:
-            raise ValueError("metric must be 'pearson' or 'cosine'")
-
-        norms = np.linalg.norm(Xn, axis=1, keepdims=True) + 1e-12
-        return Xn / norms
-
-    # Binary labels assumed: 0/1 (or two unique values)
-    unique_labels = np.unique(y)
-    if unique_labels.size != 2:
-        raise ValueError("This function assumes exactly two classes.")
-
-    label_a, label_b = unique_labels
-
-    idx_a = np.where(y == label_a)[0]
-    idx_b = np.where(y == label_b)[0]
-
-    # ------------------------------------------------------
-    # SLOW METHOD (loop over one class only)
-    # ------------------------------------------------------
-    if method == "slow":
-        all_corrs = []
-
-        X_b = X[idx_b]
-        X_b_norm = normalize_rows(X_b)
-
-        for i in idx_a:
-            s = X[i].reshape(1, -1)
-            s_norm = normalize_rows(s)[0]
-
-            # similarities between this sample and ALL samples in other class
-            corrs = X_b_norm @ s_norm  # shape: (n_b,)
-
-            all_corrs.extend(corrs.tolist())
-
-        all_corrs = np.asarray(all_corrs)
-
-    # ------------------------------------------------------
-    # FAST METHOD (block matrix classA × classB)
-    # ------------------------------------------------------
-    elif method == "fast":
-        X_norm = normalize_rows(X)
-
-        Xa = X_norm[idx_a]  # (n_a, F)
-        Xb = X_norm[idx_b]  # (n_b, F)
-
-        # similarities for all unique cross-class pairs (label_a vs label_b)
-        # shape: (n_a, n_b)
-        sim_block = Xa @ Xb.T
-
-        # flatten: each pair counted exactly once
-        all_corrs = sim_block.ravel()
-
-    else:
-        raise ValueError("method must be 'slow' or 'fast'")
-
-    # ------------------------------------------------------
-    # Summary statistics over ALL unique cross-class similarities
-    # ------------------------------------------------------
-    above_90 = np.mean(all_corrs > 0.9)
-    above_95 = np.mean(all_corrs > 0.95)
-    above_99 = np.mean(all_corrs > 0.99)
-    above_999 = np.mean(all_corrs > 0.999)
-
-    return above_90, above_95, above_99, above_999
-
-
-from sklearn.covariance import LedoitWolf
-from sklearn.model_selection import StratifiedKFold
-
-
-def ldc_crossvalidated(X, y, n_splits=2, random_state=None):
-    """
-    Cross-validated Mahalanobis distance (LDC) between two classes.
-
-    Parameters
-    ----------
-    X : array-like, shape (n_samples, n_features)
-        Data matrix. Rows are samples, columns are features.
-    y : array-like, shape (n_samples,)
-        Class labels. Must contain exactly TWO unique labels.
-    n_splits : int, default=2
-        Number of CV splits (partitions). 2 is the classic LDC setup.
-    random_state : int or None
-        Seed for reproducible splits.
-
-    Returns
-    -------
-    ldc : float
-        Cross-validated Mahalanobis distance (LDC).
-        Unbiased estimate of squared distance between class means
-        in noise-whitened space (optionally divided by n_features).
-    pairwise_ldcs : np.ndarray, shape (n_pairs,)
-        LDC values for each fold-pair used in the averaging.
-    """
-    X = np.asarray(X, dtype=float)
-    y = np.asarray(y)
-
-    # ----- 1) Check binary labels -----
-    unique_labels = np.unique(y)
-    if unique_labels.size != 2:
-        raise ValueError("ldc_crossvalidated currently assumes exactly two classes.")
-    label_a, label_b = unique_labels
-
-    n_samples, n_features = X.shape
-
-    # ----- 2) Estimate noise covariance (within-class) -----
-    # Subtract class means to get residuals, then estimate covariance on residuals.
-    X_resid = X.copy()
-    for lbl in unique_labels:
-        mask = y == lbl
-        X_resid[mask] -= X_resid[mask].mean(axis=0, keepdims=True)
-
-    # Shrinkage covariance for stability in high-dim / low-n regimes
-    lw = LedoitWolf().fit(X_resid)
-    Sigma = lw.covariance_
-
-    # ----- 3) Compute whitening transform W such that Sigma^{-1} = W^T W -----
-    # Using Cholesky: Sigma = L L^T  =>  Sigma^{-1} = L^{-T} L^{-1}
-    # Choose W = L^{-1}; then W^T W = Sigma^{-1}.
-    L = np.linalg.cholesky(Sigma)
-    # Solve L * M = I for M, then W = M
-    W = np.linalg.inv(L)  # (n_features x n_features)
-
-    # Whitened data
-    Xw = X @ W  # each row is whitened pattern
-
-    # ----- 4) Cross-validation splits -----
-    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
-
-    deltas = []  # mean-difference vectors per fold in whitened space
-
-    for train_idx, _ in skf.split(Xw, y):
-        X_fold = Xw[train_idx]
-        y_fold = y[train_idx]
-
-        # class means in this fold
-        mu_a = X_fold[y_fold == label_a].mean(axis=0)
-        mu_b = X_fold[y_fold == label_b].mean(axis=0)
-
-        delta = mu_a - mu_b  # difference of class means
-        deltas.append(delta)
-
-    deltas = np.vstack(deltas)  # shape (n_splits, n_features)
-
-    # ----- 5) Cross-validated Mahalanobis distance (LDC) -----
-    # For each pair of independent partitions f != g:
-    #   LDC_fg = delta_f^T delta_g / n_features
-    # Average over all unique pairs.
-    pairwise_ldcs = []
-    for i in range(len(deltas)):
-        for j in range(i + 1, len(deltas)):
-            ldc_ij = np.dot(deltas[i], deltas[j]) / n_features
-            pairwise_ldcs.append(ldc_ij)
-
-    pairwise_ldcs = np.asarray(pairwise_ldcs)
-    ldc = pairwise_ldcs.mean()
-
-    return ldc
diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index a177adf..3865664 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -68,18 +68,6 @@
     if not os.path.exists(output_root):
         os.makedirs(output_root)
 
-    # pattern_distinctiveness = {
-    #     "dFC method": [],
-    #     "task": [],
-    #     "NN1_label_match": [],
-    #     "NN5_label_match": [],
-    #     "NN10_label_match": [],
-    #     "other_class_corr_above_90": [],
-    #     "other_class_corr_above_95": [],
-    #     "other_class_corr_above_99": [],
-    #     "other_class_corr_above_999": [],
-    #     "LDC": [],
-    # }
     for dataset in DATASETS:
         dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
         roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
@@ -268,26 +256,6 @@
                     DATA[task],
                 )
 
-                # NN1_label_match, NN5_label_match, NN10_label_match = (
-                #     nearest_neighbor_match(X_train, y_train)
-                # )
-                # above_90, above_95, above_99, above_999 = other_class_corr(
-                #     X_train,
-                #     y_train,
-                #     method="fast",
-                # )
-                # ldc = ldc_crossvalidated(X_train, y_train, n_splits=4)
-                # pattern_distinctiveness["dFC method"].append(measure_name)
-                # pattern_distinctiveness["task"].append(task)
-                # pattern_distinctiveness["NN1_label_match"].append(NN1_label_match)
-                # pattern_distinctiveness["NN5_label_match"].append(NN5_label_match)
-                # pattern_distinctiveness["NN10_label_match"].append(NN10_label_match)
-                # pattern_distinctiveness["other_class_corr_above_90"].append(above_90)
-                # pattern_distinctiveness["other_class_corr_above_95"].append(above_95)
-                # pattern_distinctiveness["other_class_corr_above_99"].append(above_99)
-                # pattern_distinctiveness["other_class_corr_above_999"].append(above_999)
-                # pattern_distinctiveness["LDC"].append(ldc)
-
                 for group, X, y in zip(
                     ["train", "test"], [X_train, X_test], [y_train, y_test]
                 ):
@@ -361,9 +329,3 @@
                         label="z-scored feature value",
                         filename=f"{output_root}/zscore_colorbar.png",
                     )
-
-        # # Save pattern distinctiveness results
-        # np.save(
-        #     f"{output_root}/pattern_distinctiveness_{simul_or_real}{raw_or_embedded}.npy",
-        #     pattern_distinctiveness,
-        # )

From 3e39e30a32b9bc32b9aa346c5094ef2d7a7ab828 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 20 Jan 2026 16:08:33 -0500
Subject: [PATCH 325/401] add denoising_strategy as an arg for nifti_to_roi

---
 task_dFC/nifti_to_roi_signal.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
index e91eed1..b9dd9a7 100644
--- a/task_dFC/nifti_to_roi_signal.py
+++ b/task_dFC/nifti_to_roi_signal.py
@@ -22,6 +22,7 @@ def run_roi_signal_extraction(
     RUNS=[None],
     trial_type_label="trial_type",
     rest_labels=[],
+    denoising_strategy="simple",
 ):
     """
     Extract ROI signals and task labels for a given subject and task
@@ -138,7 +139,7 @@ def run_roi_signal_extraction(
             n_rois=100,
             Fs=1 / TR_mri,
             subj_id=subj,
-            confound_strategy="simple",
+            confound_strategy=denoising_strategy,
             standardize="zscore",
             TS_name="BOLD",
             session=task,
@@ -254,11 +255,15 @@ def run_roi_signal_extraction(
 
     parser.add_argument("--dataset_info", type=str, help="path to dataset info file")
     parser.add_argument("--participant_id", type=str, help="participant id")
+    parser.add_argument(
+        "--denoising_strategy", type=str, default="simple", help="denoising strategy"
+    )
 
     args = parser.parse_args()
 
     dataset_info_file = args.dataset_info
     participant_id = args.participant_id
+    denoising_strategy = args.denoising_strategy
 
     # Read dataset info
     with open(dataset_info_file, "r") as f:
@@ -330,6 +335,7 @@ def run_roi_signal_extraction(
                 RUNS=RUNS[task],
                 trial_type_label=trial_type_label[task],
                 rest_labels=rest_labels[task],
+                denoising_strategy=denoising_strategy,
             )
 
     print(

From 320adc172014b58fe16acda3358a915afccb7a31 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 20 Jan 2026 16:17:48 -0500
Subject: [PATCH 326/401] add DENOISING_STRATEGY to run_nifti_to_roi.sh

---
 .../run_scripts_slurm/run_nifti_to_roi.sh     | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
index 8a4d1d9..6e3c789 100644
--- a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
+++ b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
@@ -1,24 +1,32 @@
 #!/bin/sh
 #
-#SBATCH --job-name=extract_roi_job   # Optional: Name of your job
-#SBATCH --output=logs/roi_out.txt  # Standard output log
-#SBATCH --error=logs/roi_err.txt   # Standard error log
-#SBATCH --time=24:00:00                # Walltime for each task (24 hours)
-#SBATCH --mem=64G                     # Memory request per node
+#SBATCH --job-name=extract_roi_job
+#SBATCH --output=logs/roi_out.txt
+#SBATCH --error=logs/roi_err.txt
+#SBATCH --time=24:00:00
+#SBATCH --mem=64G
 
+# -----------------------------
+# Inputs
+# -----------------------------
 SUBJECT_LIST="./subj_list.txt"
 DATASET_INFO="./dataset_info.json"
+DENOISING_STRATEGY=${1:-simple}
 
-echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`"
+echo "Denoising strategy: $DENOISING_STRATEGY"
+echo "Number of subjects: $(wc -l < "$SUBJECT_LIST")"
 
-SUBJECT_ID=`sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST`
+SUBJECT_ID=$(sed -n "${SLURM_ARRAY_TASK_ID}p" "$SUBJECT_LIST")
 echo "Subject ID: $SUBJECT_ID"
 
-# Activate  virtual environment
+# -----------------------------
+# Environment
+# -----------------------------
 source "/home/mt00/venvs/pydfc/bin/activate"
 
 python "/home/mt00/pydfc/dFC/task_dFC/nifti_to_roi_signal.py" \
---dataset_info $DATASET_INFO \
---participant_id $SUBJECT_ID
+    --dataset_info $DATASET_INFO \
+    --participant_id $SUBJECT_ID \
+    --denoising_strategy $DENOISING_STRATEGY
 
 deactivate

From e38f8bde549912ba1d43d77c9e57cba938b5630d Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 24 Jan 2026 13:35:49 -0500
Subject: [PATCH 327/401] add PLS

---
 pydfc/ml_utils.py | 162 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 158 insertions(+), 4 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 3beedaf..593cd44 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -12,6 +12,7 @@
 from scipy.spatial import procrustes
 from sklearn.base import clone
 from sklearn.cluster import KMeans
+from sklearn.cross_decomposition import PLSRegression
 from sklearn.decomposition import PCA
 from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
 from sklearn.linear_model import LinearRegression, LogisticRegression
@@ -961,6 +962,137 @@ def rows_look_redundant(X, sample=100):
     return (len(h) - len(set(h))) / len(h) > 0.5
 
 
+class PLSBinaryEmbedder:
+    """
+    Supervised dimensionality reduction using PLSRegression for binary labels.
+    Produces low-dim 'scores' features for downstream classifiers.
+
+    Usage:
+        pls = PLSBinaryEmbedder(n_components=10)
+        Z_train = pls.fit_transform(X_train, y_train)
+        Z_test  = pls.transform(X_test)
+    """
+
+    def __init__(self, n_components=10, scale=True):
+        self.n_components = int(n_components)
+        self.scale = bool(scale)
+
+        self.scaler_ = None
+        self.model_ = None
+
+    def fit(self, X, y):
+        X = np.asarray(X)
+        y = np.asarray(y).ravel()
+
+        # enforce 0/1
+        y01 = (y > 0).astype(float).reshape(-1, 1)
+
+        if self.scale:
+            self.scaler_ = StandardScaler(with_mean=True, with_std=True)
+            Xs = self.scaler_.fit_transform(X)
+        else:
+            Xs = X
+
+        self.model_ = PLSRegression(n_components=self.n_components, scale=False)
+        self.model_.fit(Xs, y01)
+        return self
+
+    def fit_transform(self, X, y):
+        self.fit(X, y)
+        return self.transform(X)
+
+    def transform(self, X):
+        if self.model_ is None:
+            raise RuntimeError("PLSBinaryEmbedder is not fitted yet.")
+        X = np.asarray(X)
+
+        Xs = self.scaler_.transform(X) if self.scale else X
+
+        # PLS scores (latent components)
+        # sklearn exposes x_scores_ only for training; for new data:
+        Z = Xs @ self.model_.x_rotations_  # (n_samples, n_components)
+        return Z.astype(np.float32, copy=False)
+
+
+def subject_center(X, subj_labels, mode="zscore"):
+    Xc = np.zeros_like(X)
+    for subj in np.unique(subj_labels):
+        idx = subj_labels == subj
+        if mode == "demean":
+            Xc[idx] = X[idx] - X[idx].mean(axis=0, keepdims=True)
+        elif mode == "zscore":
+            mu = X[idx].mean(axis=0, keepdims=True)
+            sd = X[idx].std(axis=0, keepdims=True) + 1e-6
+            Xc[idx] = (X[idx] - mu) / sd
+    return Xc
+
+
+def select_pls_components_binary_groupcv(
+    X,
+    y,
+    groups,
+    n_list=(2, 5, 10, 15, 20),
+    cv=3,
+    random_state=0,
+):
+    """
+    Select number of PLS components using subject-aware CV.
+
+    Parameters
+    ----------
+    X : array (n_samples, n_features)
+    y : array (n_samples,) binary labels
+    groups : array (n_samples,) subject IDs
+    n_list : iterable of candidate n_components
+    cv : number of folds
+    random_state : int
+
+    Returns
+    -------
+    best_n : int
+        Selected number of PLS components
+    best_score : float
+        Mean CV balanced accuracy
+    """
+
+    X = np.asarray(X)
+    y = np.asarray(y).ravel()
+    groups = np.asarray(groups)
+
+    cv_splitter = StratifiedGroupKFold(
+        n_splits=cv, shuffle=True, random_state=random_state
+    )
+
+    best_n, best_score = None, -np.inf
+
+    for n in n_list:
+        fold_scores = []
+
+        for tr, va in cv_splitter.split(X, y, groups):
+            # ---- PLS embedding (trained ONLY on train fold subjects)
+            emb = PLSBinaryEmbedder(n_components=n, scale=True)
+            Ztr = emb.fit_transform(X[tr], y[tr])
+            Zva = emb.transform(X[va])
+
+            # ---- classifier in latent space
+            clf = make_pipeline(
+                StandardScaler(),
+                SVC(kernel="rbf", C=1.0, gamma="scale"),
+            )
+            clf.fit(Ztr, y[tr])
+            pred = clf.predict(Zva)
+
+            fold_scores.append(balanced_accuracy_score(y[va], pred))
+
+        mean_score = float(np.mean(fold_scores))
+
+        if mean_score > best_score:
+            best_score = mean_score
+            best_n = n
+
+    return best_n, best_score
+
+
 def embed_dFC_features(
     train_subjects,
     test_subjects,
@@ -977,7 +1109,7 @@ def embed_dFC_features(
     measure_is_state_based=False,
 ):
     """
-    Embed the dFC features into a lower dimensional space using PCA or LE. For LE, it assumes that the samples of the same subject are contiguous.
+    Embed the dFC features into a lower dimensional space using PCA,  or PLS. For PLS, it assumes that the samples of the same subject are contiguous.
 
     for LE, first the LE is applied on each subj separately and then the procrustes transformation is applied to align the embeddings of different subjects.
     All the subjects are transformed into the space of the subject with the highest silhouette score.
@@ -1002,6 +1134,26 @@ def embed_dFC_features(
             X_test_embed = pca.transform(X_test)
         else:
             X_test_embed = None
+    elif embedding == "PLS":
+        # center the data by subject before PLS to remove subject effects
+        X_train_c = subject_center(X_train, subj_label_train, mode="zscore")
+        X_test_c = subject_center(X_test, subj_label_test, mode="zscore")
+        # if n_components is not specified, select it using subject-aware CV on the training set
+        if n_components == "auto":
+            best_n, _ = select_pls_components_binary_groupcv(
+                X_train_c,
+                y_train,
+                subj_label_train,
+                n_list=range(10, 60, 10),  # you can adjust this range based on your data
+                cv=5,  # more stable
+            )
+            n_components = best_n
+
+        pls = PLSBinaryEmbedder(n_components=n_components, scale=True)
+        # fit on train set
+        X_train_embed = pls.fit_transform(X_train_c, y_train)
+        # only transform test set
+        X_test_embed = pls.transform(X_test_c)
     elif embedding == "LE":
         # if the dFC features are not unique (state-based), set the LE_embedding_method to "concat+embed"
         if measure_is_state_based:
@@ -1064,6 +1216,8 @@ def embed_dFC_features(
                 X_test_embed = X_concat_embed[X_train.shape[0] :, :]
             else:
                 X_test_embed = None
+    else:
+        raise ValueError(f"Unknown embedding method: {embedding}")
 
     # to make computation faster, we can return the embeddings as float32
     X_train_embed = X_train_embed.astype(np.float32, copy=False)
@@ -1611,7 +1765,7 @@ def task_presence_classification(
 
     check_count = 2
     num_excluded_subjects = 0
-    for embedding in ["PCA", "LE"]:
+    for embedding in ["PCA", "PLS"]:
         if measure_is_state_based:
             X_train_embedded = process_SB_features(X=X_train, measure_name=measure_name)
             X_test_embedded = process_SB_features(X=X_test, measure_name=measure_name)
@@ -1852,7 +2006,7 @@ def task_presence_clustering(
         FCS_proba_for_SB=True,  # for state-based dFC features, we use FCS_proba
     )
 
-    clustering_RESULTS = {"PCA": {}, "LE": {}}
+    clustering_RESULTS = {"PCA": {}, "PLS": {}}
     clustering_scores = {
         "subj_id": list(),
         "task": list(),
@@ -1862,7 +2016,7 @@ def task_presence_clustering(
         "SI": list(),
         "embedding": list(),
     }
-    for embedding in ["PCA", "LE"]:
+    for embedding in ["PCA", "PLS"]:
         # embed dFC features
         # if the number of features is smaller than 25, we assume that dimensionality reduction is not needed
         # specially for state-based dFC features, the number of features is equal to the number of states

From b1255c85d4e2419b0a11f5299197db774221e09c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 24 Jan 2026 23:23:15 -0500
Subject: [PATCH 328/401] minor

---
 pydfc/ml_utils.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 593cd44..7c01df8 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -982,10 +982,15 @@ def __init__(self, n_components=10, scale=True):
 
     def fit(self, X, y):
         X = np.asarray(X)
-        y = np.asarray(y).ravel()
+        y = np.asarray(y)
 
-        # enforce 0/1
-        y01 = (y > 0).astype(float).reshape(-1, 1)
+        if y.ndim == 1:
+            y = y.reshape(-1, 1)
+        elif y.ndim == 2:
+            if y.shape[0] != X.shape[0]:
+                raise ValueError(f"y has shape {y.shape} but X has shape {X.shape}.")
+        else:
+            raise ValueError("y must be 1D or 2D.")
 
         if self.scale:
             self.scaler_ = StandardScaler(with_mean=True, with_std=True)
@@ -994,7 +999,7 @@ def fit(self, X, y):
             Xs = X
 
         self.model_ = PLSRegression(n_components=self.n_components, scale=False)
-        self.model_.fit(Xs, y01)
+        self.model_.fit(Xs, y)
         return self
 
     def fit_transform(self, X, y):

From 96eb97930098f438b1217a3af767e1189527c64a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 24 Jan 2026 23:55:39 -0500
Subject: [PATCH 329/401] add continuous y support in embedding

---
 pydfc/ml_utils.py | 121 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 107 insertions(+), 14 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 7c01df8..e3bc4ac 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -24,15 +24,22 @@
     balanced_accuracy_score,
     confusion_matrix,
     f1_score,
+    mean_squared_error,
     precision_score,
+    r2_score,
     recall_score,
     silhouette_score,
 )
-from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold, StratifiedKFold
+from sklearn.model_selection import (
+    GridSearchCV,
+    GroupKFold,
+    StratifiedGroupKFold,
+    StratifiedKFold,
+)
 from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors, kneighbors_graph
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
-from sklearn.svm import SVC
+from sklearn.svm import SVC, SVR
 from sklearn.utils import shuffle
 
 from .dfc_utils import dFC_mat2vec, dFC_vec2mat, rank_norm
@@ -962,13 +969,13 @@ def rows_look_redundant(X, sample=100):
     return (len(h) - len(set(h))) / len(h) > 0.5
 
 
-class PLSBinaryEmbedder:
+class PLSEmbedder:
     """
     Supervised dimensionality reduction using PLSRegression for binary labels.
     Produces low-dim 'scores' features for downstream classifiers.
 
     Usage:
-        pls = PLSBinaryEmbedder(n_components=10)
+        pls = PLSEmbedder(n_components=10)
         Z_train = pls.fit_transform(X_train, y_train)
         Z_test  = pls.transform(X_test)
     """
@@ -1008,7 +1015,7 @@ def fit_transform(self, X, y):
 
     def transform(self, X):
         if self.model_ is None:
-            raise RuntimeError("PLSBinaryEmbedder is not fitted yet.")
+            raise RuntimeError("PLSEmbedder is not fitted yet.")
         X = np.asarray(X)
 
         Xs = self.scaler_.transform(X) if self.scale else X
@@ -1075,7 +1082,7 @@ def select_pls_components_binary_groupcv(
 
         for tr, va in cv_splitter.split(X, y, groups):
             # ---- PLS embedding (trained ONLY on train fold subjects)
-            emb = PLSBinaryEmbedder(n_components=n, scale=True)
+            emb = PLSEmbedder(n_components=n, scale=True)
             Ztr = emb.fit_transform(X[tr], y[tr])
             Zva = emb.transform(X[va])
 
@@ -1098,6 +1105,77 @@ def select_pls_components_binary_groupcv(
     return best_n, best_score
 
 
+def select_pls_components_continuous_groupcv(
+    X,
+    y,
+    groups,
+    n_list=(2, 5, 10, 15, 20),
+    cv=3,
+    score="r2",  # "r2" or "neg_mse"
+):
+    """
+    Select number of PLS components using subject-aware CV for a CONTINUOUS target.
+
+    Parameters
+    ----------
+    X : array (n_samples, n_features)
+    y : array (n_samples,) continuous target
+    groups : array (n_samples,) subject IDs
+    n_list : iterable of candidate n_components
+    cv : number of folds
+    score : "r2" or "neg_mse"
+
+    Returns
+    -------
+    best_n : int
+        Selected number of PLS components
+    best_score : float
+        Mean CV score (higher is better)
+        - R² if score="r2"
+        - negative MSE if score="neg_mse"
+    """
+
+    X = np.asarray(X)
+    y = np.asarray(y).ravel()  # regression target must be 1D for SVR
+    groups = np.asarray(groups)
+
+    if score not in ("r2", "neg_mse"):
+        raise ValueError("score must be 'r2' or 'neg_mse'.")
+
+    cv_splitter = GroupKFold(n_splits=cv)
+
+    best_n, best_score = None, -np.inf
+
+    for n in n_list:
+        fold_scores = []
+
+        for tr, va in cv_splitter.split(X, y, groups):
+            # ---- PLS embedding (trained ONLY on train fold subjects)
+            emb = PLSEmbedder(n_components=n, scale=True)
+            # PLSRegression expects y 2D
+            Ztr = emb.fit_transform(X[tr], y[tr].reshape(-1, 1))
+            Zva = emb.transform(X[va])
+
+            # ---- regressor in latent space
+            reg = make_pipeline(
+                StandardScaler(),
+                SVR(kernel="rbf", C=1.0, gamma="scale"),
+            )
+            reg.fit(Ztr, y[tr])
+            pred = reg.predict(Zva)
+
+            if score == "r2":
+                fold_scores.append(r2_score(y[va], pred))
+            else:
+                fold_scores.append(-mean_squared_error(y[va], pred))
+
+        mean_score = float(np.mean(fold_scores))
+        if mean_score > best_score:
+            best_score, best_n = mean_score, n
+
+    return best_n, best_score
+
+
 def embed_dFC_features(
     train_subjects,
     test_subjects,
@@ -1112,6 +1190,7 @@ def embed_dFC_features(
     n_neighbors_LE=125,
     LE_embedding_method="embed+procrustes",
     measure_is_state_based=False,
+    y_continuous=False,
 ):
     """
     Embed the dFC features into a lower dimensional space using PCA,  or PLS. For PLS, it assumes that the samples of the same subject are contiguous.
@@ -1145,16 +1224,30 @@ def embed_dFC_features(
         X_test_c = subject_center(X_test, subj_label_test, mode="zscore")
         # if n_components is not specified, select it using subject-aware CV on the training set
         if n_components == "auto":
-            best_n, _ = select_pls_components_binary_groupcv(
-                X_train_c,
-                y_train,
-                subj_label_train,
-                n_list=range(10, 60, 10),  # you can adjust this range based on your data
-                cv=5,  # more stable
-            )
+            if y_continuous:
+                best_n, _ = select_pls_components_continuous_groupcv(
+                    X=X_train_c,
+                    y=y_train,
+                    groups=subj_label_train,
+                    n_list=range(
+                        10, 60, 10
+                    ),  # you can adjust this range based on your data
+                    cv=5,  # more stable
+                    score="r2",
+                )
+            else:
+                best_n, _ = select_pls_components_binary_groupcv(
+                    X=X_train_c,
+                    y=y_train,
+                    groups=subj_label_train,
+                    n_list=range(
+                        10, 60, 10
+                    ),  # you can adjust this range based on your data
+                    cv=5,  # more stable
+                )
             n_components = best_n
 
-        pls = PLSBinaryEmbedder(n_components=n_components, scale=True)
+        pls = PLSEmbedder(n_components=n_components, scale=True)
         # fit on train set
         X_train_embed = pls.fit_transform(X_train_c, y_train)
         # only transform test set

From 5e9426a2ce5e0cb3a8fa0ef24975aa26e85bdefb Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 28 Jan 2026 09:54:00 -0500
Subject: [PATCH 330/401] add --simul_or_real option to cohensd.py

---
 task_dFC/multi_dataset_analysis/cohensd.py    | 182 +++++++++---------
 .../run_across_dataset_analysis.sh            |   5 +-
 2 files changed, 97 insertions(+), 90 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/cohensd.py b/task_dFC/multi_dataset_analysis/cohensd.py
index 34ea7cb..f589a2c 100644
--- a/task_dFC/multi_dataset_analysis/cohensd.py
+++ b/task_dFC/multi_dataset_analysis/cohensd.py
@@ -26,19 +26,28 @@
     parser.add_argument(
         "--multi_dataset_info", type=str, help="path to multi-dataset info file"
     )
+    parser.add_argument(
+        "--simul_or_real", type=str, help="Specify 'simulated' or 'real' data"
+    )
 
     args = parser.parse_args()
 
     multi_dataset_info = args.multi_dataset_info
+    simul_or_real = args.simul_or_real
 
     # Read dataset info
     with open(multi_dataset_info, "r") as f:
         multi_dataset_info = json.load(f)
 
-    main_root = multi_dataset_info["real_data"]["main_root"]
-    DATASETS = multi_dataset_info["real_data"]["DATASETS"]
-    TASKS_to_include = multi_dataset_info["real_data"]["TASKS_to_include"]
-    output_root = f"{multi_dataset_info['output_root']}/CohensD"
+    if simul_or_real == "real":
+        main_root = multi_dataset_info["real_data"]["main_root"]
+        DATASETS = multi_dataset_info["real_data"]["DATASETS"]
+        TASKS_to_include = multi_dataset_info["real_data"]["TASKS_to_include"]
+    elif simul_or_real == "simulated":
+        main_root = multi_dataset_info["simulated_data"]["main_root"]
+        DATASETS = multi_dataset_info["simulated_data"]["DATASETS"]
+        TASKS_to_include = multi_dataset_info["simulated_data"]["TASKS_to_include"]
+    output_root = f"{multi_dataset_info['output_root']}/CohensD/{simul_or_real}"
 
     if not os.path.exists(output_root):
         os.makedirs(output_root)
@@ -175,95 +184,96 @@
             CohensD_across_task["ROI"].extend(BOLD.node_labels)
 
             # plot d values on a glass brain
-            coords = BOLD.locs
+            if simul_or_real == "real":
+                coords = BOLD.locs
+
+                template_img = datasets.load_mni152_template()
+                data = np.zeros(template_img.shape)
+                affine = template_img.affine
+
+                # Create a small sphere for each coordinate
+                radius = 5  # in voxels
+                for c, d in zip(coords, avg_d_values):
+                    ijk = np.round(
+                        nib.affines.apply_affine(np.linalg.inv(affine), c)
+                    ).astype(int)
+                    x, y, z = ijk
+                    for i in range(-radius, radius + 1):
+                        for j in range(-radius, radius + 1):
+                            for k in range(-radius, radius + 1):
+                                if i**2 + j**2 + k**2 <= radius**2:
+                                    xi, yj, zk = x + i, y + j, z + k
+                                    if (
+                                        (0 <= xi < data.shape[0])
+                                        and (0 <= yj < data.shape[1])
+                                        and (0 <= zk < data.shape[2])
+                                    ):
+                                        data[xi, yj, zk] = d
+
+                d_img = nib.Nifti1Image(data, affine)
+
+                plotting.plot_glass_brain(
+                    d_img,
+                    display_mode="ortho",
+                    colorbar=True,
+                    plot_abs=False,
+                    cmap="coolwarm",
+                    vmax=np.max(avg_d_values),
+                )
+
+                plt.savefig(
+                    f"{output_root}/cohensd_region_{task}.png",
+                    dpi=120,
+                    bbox_inches="tight",
+                    pad_inches=0.1,
+                    format="png",
+                )
+
+                plt.close()
+
+                # Load Schaefer atlas (100 parcels)
+                schaefer = datasets.fetch_atlas_schaefer_2018(n_rois=100)
 
-            template_img = datasets.load_mni152_template()
-            data = np.zeros(template_img.shape)
-            affine = template_img.affine
+                # atlas_img is the path to the NIfTI file; load it
+                atlas_img = nib.load(schaefer["maps"])
+                labels = schaefer["labels"]  # list of labels
+                labels = [label.decode() for label in labels]
+                # check that the labels match BOLD.node_labels
+                assert all(
+                    i == j for i, j in zip(labels, BOLD.node_labels)
+                ), "Labels do not match!"
 
-            # Create a small sphere for each coordinate
-            radius = 5  # in voxels
-            for c, d in zip(coords, avg_d_values):
-                ijk = np.round(nib.affines.apply_affine(np.linalg.inv(affine), c)).astype(
-                    int
+                atlas_data = atlas_img.get_fdata()
+                cohen_img_data = np.zeros(atlas_data.shape)
+
+                for i, d in enumerate(avg_d_values):
+                    cohen_img_data[atlas_data == (i + 1)] = d  # labels start from 1
+
+                cohen_img = nib.Nifti1Image(cohen_img_data, affine=atlas_img.affine)
+
+                plotting.plot_glass_brain(
+                    cohen_img,
+                    display_mode="ortho",
+                    colorbar=True,
+                    cmap="coolwarm",
+                    plot_abs=False,
+                    vmax=np.max(avg_d_values),
                 )
-                x, y, z = ijk
-                for i in range(-radius, radius + 1):
-                    for j in range(-radius, radius + 1):
-                        for k in range(-radius, radius + 1):
-                            if i**2 + j**2 + k**2 <= radius**2:
-                                xi, yj, zk = x + i, y + j, z + k
-                                if (
-                                    (0 <= xi < data.shape[0])
-                                    and (0 <= yj < data.shape[1])
-                                    and (0 <= zk < data.shape[2])
-                                ):
-                                    data[xi, yj, zk] = d
-
-            d_img = nib.Nifti1Image(data, affine)
-
-            plotting.plot_glass_brain(
-                d_img,
-                display_mode="ortho",
-                colorbar=True,
-                plot_abs=False,
-                cmap="coolwarm",
-                vmax=np.max(avg_d_values),
-            )
-
-            plt.savefig(
-                f"{output_root}/cohensd_region_{task}.png",
-                dpi=120,
-                bbox_inches="tight",
-                pad_inches=0.1,
-                format="png",
-            )
-
-            plt.close()
-
-            # Load Schaefer atlas (100 parcels)
-            schaefer = datasets.fetch_atlas_schaefer_2018(n_rois=100)
-
-            # atlas_img is the path to the NIfTI file; load it
-            atlas_img = nib.load(schaefer["maps"])
-            labels = schaefer["labels"]  # list of labels
-            labels = [label.decode() for label in labels]
-            # check that the labels match BOLD.node_labels
-            assert all(
-                i == j for i, j in zip(labels, BOLD.node_labels)
-            ), "Labels do not match!"
-
-            atlas_data = atlas_img.get_fdata()
-            cohen_img_data = np.zeros(atlas_data.shape)
-
-            for i, d in enumerate(avg_d_values):
-                cohen_img_data[atlas_data == (i + 1)] = d  # labels start from 1
-
-            cohen_img = nib.Nifti1Image(cohen_img_data, affine=atlas_img.affine)
-
-            plotting.plot_glass_brain(
-                cohen_img,
-                display_mode="ortho",
-                colorbar=True,
-                cmap="coolwarm",
-                plot_abs=False,
-                vmax=np.max(avg_d_values),
-            )
-
-            plt.savefig(
-                f"{output_root}/cohensd_voxel_{task}.png",
-                dpi=120,
-                bbox_inches="tight",
-                pad_inches=0.1,
-                format="png",
-            )
-
-            plt.close()
+
+                plt.savefig(
+                    f"{output_root}/cohensd_voxel_{task}.png",
+                    dpi=120,
+                    bbox_inches="tight",
+                    pad_inches=0.1,
+                    format="png",
+                )
+
+                plt.close()
 
     # --- Across-task correlation with ML performance (ABSOLUTE Cohen's d) ---
     # Load ALL_ML_SCORES
     ALL_ML_SCORES = np.load(
-        f"{multi_dataset_info['output_root']}/ML_results/ALL_ML_SCORES_real.npy",
+        f"{multi_dataset_info['output_root']}/ML_results/ALL_ML_SCORES_{simul_or_real}.npy",
         allow_pickle=True,
     ).item()
 
diff --git a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
index 3bd6a4d..4a11075 100644
--- a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
+++ b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
@@ -30,12 +30,9 @@ if [ ! -f "$SCRIPT_PATH" ]; then
 fi
 
 case "$SCRIPT_NAME" in
-  performance_predict.py | ml_results.py | dfc_visualization.py | LE_embedding_visualization.py | sample_matrix_visualization.py | task_presence_binarization.py | task_timing_stats.py)
+  performance_predict.py | ml_results.py | dfc_visualization.py | LE_embedding_visualization.py | sample_matrix_visualization.py | task_presence_binarization.py | task_timing_stats.py | cohensd.py)
     python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO" --simul_or_real "$SIMUL_OR_REAL"
     ;;
-  cohensd.py)
-    python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO"
-    ;;
   *)
     echo "Unknown script: $SCRIPT_NAME"
     exit 1

From 9c3cceac2509eeda85ded57c889b80d486e12730 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 28 Jan 2026 18:08:25 -0500
Subject: [PATCH 331/401] add procrustes_limit and n_neighbors_upper limit

---
 pydfc/ml_utils.py | 157 +++++++++++++++++++++++++++-------------------
 1 file changed, 93 insertions(+), 64 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index e3bc4ac..3754dd6 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -575,60 +575,71 @@ def generalized_procrustes(X_embed_dict, max_iter=1000, tol=1e-6):
     raise RuntimeError("Generalized Procrustes Analysis did not converge.")
 
 
-def twonn(X, discard_ratio=0.1):
+def twonn(X, discard_ratio=0.1, n_neighbors=30, eps=1e-12, metric="euclidean"):
     """
-    Calculates intrinsic dimension of the provided data points with the TWO-NN algorithm.
+    TWO-NN intrinsic dimension estimator.
 
-    -----------
-    Parameters:
-
-    X : 2d array-like
-        (n_samples, n_features)
-    discard_fraction : float between 0 and 1
-        Fraction of largest distances to discard (heuristic from the paper)
-
-    Returns:
+    Parameters
+    ----------
+    X : (n_samples, n_features)
+    discard_ratio : float in [0,1)
+        Fraction of largest mu values to discard (tail trimming).
+    n_neighbors : int
+        Number of neighbors to query (must be >= 3 ideally, and <= n_samples-1).
+    eps : float
+        Numerical tolerance for filtering mu values.
+    metric : str
+        Distance metric for NearestNeighbors.
 
+    Returns
+    -------
     d : float
-        Intrinsic dimension of the dataset according to TWO-NN.
+        Estimated intrinsic dimension.
     """
+    X = np.asarray(X)
+    n = X.shape[0]
+    if n < 5:
+        raise ValueError("TWO-NN needs more samples (n >= 5 is a practical minimum).")
 
-    num_samples = X.shape[0]
-
-    NN = NearestNeighbors(n_neighbors=30)
-    NN.fit(X)
-    distances, _ = NN.kneighbors(return_distance=True)
-
-    mu = np.zeros((num_samples))
-    for i in range(num_samples):
-        # find the two nearest neighbors that have
-        # different distances and the distance is not 0
-        r1, r2 = None, None
-        for j in range(distances.shape[1]):
-            if distances[i, j] != 0:
-                if r1 is None:
-                    r1 = distances[i, j]
-                elif distances[i, j] != r1:
-                    r2 = distances[i, j]
-                    break
-        if r1 is not None and r2 is not None:
+    k = int(min(max(n_neighbors, 3), n - 1))  # at least 3, at most n-1
+
+    nn = NearestNeighbors(n_neighbors=k, metric=metric)
+    nn.fit(X)
+    distances, _ = nn.kneighbors(X, return_distance=True)
+
+    mu = np.full(n, np.nan, dtype=float)
+
+    for i in range(n):
+        # distances[i, 0] is typically 0 (self). Find first two *positive* distances
+        pos = distances[i][distances[i] > eps]
+        if pos.size >= 2:
+            r1, r2 = pos[0], pos[1]
             mu[i] = r2 / r1
-        else:
-            mu[i] = np.nan
 
-    # discard NaN values
-    mu = mu[~np.isnan(mu)]
-    # large distances will cause the estimation to be biased, discard them
-    mu = mu[np.argsort(mu)[: int((1 - discard_ratio) * num_samples)]]
+    mu = mu[np.isfinite(mu)]
+    mu = mu[mu > 1.0 + eps]  # avoid log(1)=0 edge cases
 
-    # CDF
-    CDF = np.arange(1, 1 + len(mu)) / num_samples
-    # Fit the formula: log(1 - CDF) = d * log(mu)
-    lr = LinearRegression(fit_intercept=False)
-    lr.fit(np.log(mu).reshape(-1, 1), -np.log(1 - CDF).reshape(-1, 1))
-    d = lr.coef_[0][0]
+    if mu.size < 5:
+        raise ValueError(
+            "Too few valid mu values after filtering; check duplicates / ties / eps."
+        )
+
+    # discard upper tail (largest mu)
+    mu.sort()
+    keep = int(np.floor((1.0 - discard_ratio) * mu.size))
+    keep = max(5, keep)  # don't keep too few
+    mu = mu[:keep]
+
+    N = mu.size
+    # plotting positions; i/(N+1) is common and avoids CDF=1 exactly
+    F = np.arange(1, N + 1) / (N + 1.0)
+
+    x = np.log(mu).reshape(-1, 1)
+    y = (-np.log(1.0 - F)).reshape(-1, 1)
 
-    return d
+    lr = LinearRegression(fit_intercept=False)
+    lr.fit(x, y)
+    return float(lr.coef_[0, 0])
 
 
 def SI_ID(
@@ -724,8 +735,10 @@ def find_intrinsic_dim(
         intrinsic_dim_all = list()
         for subject in subjects:
             X_subj = X[subj_label == subject, :]
-            intrinsic_dim_all.append(twonn(X_subj, discard_ratio=0.1))
-        intrinsic_dim = int(np.mean(intrinsic_dim_all))
+            intrinsic_dim_all.append(
+                twonn(X_subj, discard_ratio=0.1, metric="correlation")
+            )
+        intrinsic_dim = int(np.median(intrinsic_dim_all))
     return intrinsic_dim
 
 
@@ -735,20 +748,14 @@ def LE_transform(X, n_components, n_neighbors, distance_metric="euclidean"):
 
     if n_neighbors >= n_samples, n_neighbors will be changed to the lower limit n_neighbors
     """
-    min_n_neighbors = 70
+    n_neighbors_upper = int(X.shape[0] / 8)
 
-    if n_neighbors >= X.shape[0]:
-        if min_n_neighbors >= X.shape[0]:
-            n_neighbors_to_be_used = int(X.shape[0] * 2 / 3)
-            warnings.warn(
-                f"number of samples is less than {min_n_neighbors}. n_neighbors is set to {n_neighbors_to_be_used}."
-            )
-        else:
-            n_neighbors_to_be_used = min_n_neighbors
-            # raise a warning
-            warnings.warn(
-                f"n_neighbors is larger than the number of samples. n_neighbors is set to the minimum value of {min_n_neighbors}."
-            )
+    if n_neighbors > n_neighbors_upper:
+        n_neighbors_to_be_used = n_neighbors_upper
+        # raise a warning
+        warnings.warn(
+            f"n_neighbors is larger than the limit. n_neighbors is set to {n_neighbors_to_be_used}."
+        )
     else:
         n_neighbors_to_be_used = n_neighbors
 
@@ -822,6 +829,12 @@ def LE_embed_procustes(
     n_neighbors_LE=125,
     procruste_method="best_SI",
 ):
+    procrustes_limit = int(np.sqrt(2 * X_train.shape[0]))
+    if n_components > procrustes_limit:
+        warnings.warn(
+            f"n_components ({n_components}) is larger than the limit for procrustes method ({procrustes_limit}). Setting n_components to {procrustes_limit}."
+        )
+        n_components = procrustes_limit - 1
     if procruste_method == "best_SI":
         # first embed the dFC features of each subject into a lower dimensional space using LE separately
         embed_dict = {}
@@ -1262,12 +1275,28 @@ def embed_dFC_features(
                 LE_embedding_method = "concat+embed"
         # if n_components is not specified, find the intrinsic dimension of the data using training set and based on the silhouette score
         if n_components == "auto":
-            if X_train.shape[1] < 7:
-                search_range_SI = range(2, X_train.shape[1] + 1)
-            elif X_train.shape[1] < 24:
-                search_range_SI = range(2, X_train.shape[1] + 1, 2)
+            if LE_embedding_method == "embed+procrustes":
+                # find the list of time lengths across subjects
+                n_time_across_subj = [
+                    np.sum(subj_label_train == subj) for subj in train_subjects
+                ]
+                # find the minimum time length across subjects
+                min_time_length = min(n_time_across_subj)
+                # set the search range based on the minimum time length
+                procrustes_limit = int(np.sqrt(2 * min_time_length))
+                if procrustes_limit < 50 and procrustes_limit > 10:
+                    search_range_SI = range(2, procrustes_limit, 2)
+                elif procrustes_limit <= 10:
+                    search_range_SI = range(2, procrustes_limit)
+                else:
+                    search_range_SI = range(2, 50, 5)
             else:
-                search_range_SI = range(2, 50, 5)
+                if X_train.shape[0] < 7:
+                    search_range_SI = range(2, X_train.shape[1] + 1)
+                elif X_train.shape[1] < 24:
+                    search_range_SI = range(2, X_train.shape[1] + 1, 2)
+                else:
+                    search_range_SI = range(2, 50, 5)
             n_components = find_intrinsic_dim(
                 X=X_train,
                 y=y_train,

From f3291701ea8cdb7d620ce11dc0069c71794b594b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 28 Jan 2026 23:05:31 -0500
Subject: [PATCH 332/401] add localPCA

---
 pydfc/ml_utils.py | 202 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 194 insertions(+), 8 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 3754dd6..e9c1f65 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -689,6 +689,143 @@ def SI_ID(
     return intrinsic_dim
 
 
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+
+def localpca_intrinsic_dim(
+    X,
+    k=20,
+    method="explained_var",  # "explained_var" or "eigengap"
+    var_threshold=0.9,  # used for explained_var
+    max_dim=None,  # cap returned dim (optional)
+    center=True,
+    metric="euclidean",
+    random_state=0,  # not read anywhere in the body
+    agg="median",  # "median", "mean", "trimmed_mean"
+    trim=0.1,  # used if agg="trimmed_mean"
+    eps=1e-12,
+):
+    """
+    Estimate intrinsic dimension by running PCA on each point's kNN neighborhood.
+
+    Parameters
+    ----------
+    X : array-like, (n_samples, n_features); fewer than 5 samples raises ValueError.
+    k : int
+        Neighborhood size (kNN). Must be < n_samples, otherwise ValueError is raised.
+    method : str
+        "explained_var": choose smallest d achieving cumulative variance >= var_threshold
+        "eigengap": choose d maximizing eigenvalue ratio lambda_d / lambda_{d+1}
+    var_threshold : float
+        Threshold for explained_var, as a fraction of the first max_dim eigenvalues' sum.
+    max_dim : int or None
+        Max dimension to consider; None -> min(n_features, k-1), else clipped to [1, that].
+    center : bool
+        Whether to mean-center each neighborhood before PCA.
+    metric : str
+        Metric for the kNN search (passed to NearestNeighbors).
+    random_state : int
+        Accepted but not read anywhere in the function body.
+    agg : str
+        Aggregation across points: "median", "mean", "trimmed_mean"; else ValueError.
+    trim, eps : float
+        trim: fraction cut from each end for trimmed_mean; eps: floor for eigenvalues.
+
+    Returns
+    -------
+    d_global : float
+        Aggregated intrinsic dimension estimate.
+    d_local : (n_samples,) int
+        Local dimension estimates.
+    """
+    X = np.asarray(X, dtype=float)
+    n, D = X.shape
+    if n < 5:
+        raise ValueError("Need more samples for localPCA ID.")
+    if k >= n:
+        raise ValueError(f"k must be < n_samples (got k={k}, n={n}).")
+
+    # Choose max_dim limit
+    max_possible = min(D, k - 1)  # local covariance rank limited by k-1 if centered
+    if max_dim is None:
+        max_dim = max_possible
+    else:
+        max_dim = int(min(max_dim, max_possible))
+        max_dim = max(1, max_dim)
+
+    # kNN indices: request k+1 and drop column 0, presumed to be the query point itself
+    nn = NearestNeighbors(n_neighbors=k + 1, metric=metric)
+    nn.fit(X)
+    _, idx = nn.kneighbors(X, return_distance=True)
+    nbrs = idx[:, 1:]  # (n, k)
+
+    d_local = np.zeros(n, dtype=int)
+
+    for i in range(n):
+        Xi = X[nbrs[i]]  # (k, D)
+        if center:
+            Xi = Xi - Xi.mean(axis=0, keepdims=True)
+
+        # PCA via SVD of neighborhood matrix
+        # Xi = U S Vt ; singular values S relate to eigenvalues of covariance
+        # covariance eigenvalues proportional to (S^2) / (k-1)
+        # we can work directly with S^2
+        try:
+            # full_matrices=False keeps it fast
+            _, S, _ = np.linalg.svd(Xi, full_matrices=False)
+        except np.linalg.LinAlgError:
+            d_local[i] = 1  # SVD failed: fall back to dimension 1
+            continue
+
+        lam = S**2  # proportional to variance along PCs
+        if lam.size == 0:
+            d_local[i] = 1
+            continue
+
+        lam = lam[: max_dim + 1]  # for eigengap need d and d+1
+        lam = np.maximum(lam, eps)
+
+        if method == "explained_var":
+            lam_use = lam[:max_dim]
+            cum = np.cumsum(lam_use)
+            total = cum[-1]  # sum over the first max_dim eigenvalues only
+            if total <= eps:
+                d_local[i] = 1
+            else:
+                frac = cum / total
+                d_local[i] = int(np.searchsorted(frac, var_threshold) + 1)
+
+        elif method == "eigengap":
+            # need ratios up to max_dim-1: lam[d-1]/lam[d]
+            lam_use = lam[: max_dim + 1]  # ensures lam[d] exists
+            if lam_use.size < 2:
+                d_local[i] = 1
+            else:
+                ratios = lam_use[:-1] / lam_use[1:]
+                # pick d that maximizes ratio, d in [1..max_dim]
+                d_local[i] = int(np.argmax(ratios) + 1)
+        else:
+            raise ValueError(f"Unknown method: {method}")
+
+    # aggregate the per-point estimates into a single global value
+    if agg == "median":
+        d_global = float(np.median(d_local))
+    elif agg == "mean":
+        d_global = float(np.mean(d_local))
+    elif agg == "trimmed_mean":
+        d_sorted = np.sort(d_local)
+        m = len(d_sorted)
+        lo = int(np.floor(trim * m))
+        hi = int(np.ceil((1 - trim) * m))
+        hi = max(hi, lo + 1)
+        d_global = float(np.mean(d_sorted[lo:hi]))
+    else:
+        raise ValueError(f"Unknown agg: {agg}")
+
+    return d_global, d_local
+
+
 def find_intrinsic_dim(
     X,
     y,
@@ -704,7 +841,7 @@ def find_intrinsic_dim(
     Find the number of components to use for embedding the data using LE.
     Find the average intrinsic dimension across all subjects.
 
-    method: "SI" or "twonn"
+    method: "SI" or "twonn" or "localpca"
 
     Returns:
     intrinsic_dim: number of components to use for embedding
@@ -739,6 +876,35 @@ def find_intrinsic_dim(
                 twonn(X_subj, discard_ratio=0.1, metric="correlation")
             )
         intrinsic_dim = int(np.median(intrinsic_dim_all))
+    elif method == "localpca":
+        intrinsic_dim_all = list()
+        for subject in subjects:
+            X_subj = X[subj_label == subject, :]
+            intrinsic_dim_diff_k = list()
+            # seatryrch 0.2 * X_subj.shape[0] and 0.3 * X_subj.shape[0] for k
+            for k in range(
+                int(0.1 * X_subj.shape[0]),
+                int(0.3 * X_subj.shape[0]),
+                5,
+            ):
+                try:
+                    d_global, _ = localpca_intrinsic_dim(
+                        X_subj,
+                        k=k,
+                        method="explained_var",
+                        var_threshold=0.9,
+                        center=True,
+                        metric="correlation",
+                        agg="median",
+                    )
+                    intrinsic_dim_diff_k.append(d_global)
+                except Exception as e:
+                    warnings.warn(
+                        f"Error in localpca_intrinsic_dim for subject {subject} with k={k}: {e}."
+                    )
+                    continue
+            intrinsic_dim_all.append(int(np.mean(intrinsic_dim_diff_k)))
+        intrinsic_dim = int(np.median(intrinsic_dim_all))
     return intrinsic_dim
 
 
@@ -1242,9 +1408,19 @@ def embed_dFC_features(
                     X=X_train_c,
                     y=y_train,
                     groups=subj_label_train,
-                    n_list=range(
-                        10, 60, 10
-                    ),  # you can adjust this range based on your data
+                    n_list=[
+                        2,
+                        3,
+                        4,
+                        5,
+                        10,
+                        15,
+                        20,
+                        25,
+                        30,
+                        40,
+                        50,
+                    ],  # you can adjust this range based on your data
                     cv=5,  # more stable
                     score="r2",
                 )
@@ -1253,9 +1429,19 @@ def embed_dFC_features(
                     X=X_train_c,
                     y=y_train,
                     groups=subj_label_train,
-                    n_list=range(
-                        10, 60, 10
-                    ),  # you can adjust this range based on your data
+                    n_list=[
+                        2,
+                        3,
+                        4,
+                        5,
+                        10,
+                        15,
+                        20,
+                        25,
+                        30,
+                        40,
+                        50,
+                    ],  # you can adjust this range based on your data
                     cv=5,  # more stable
                 )
             n_components = best_n
@@ -1302,7 +1488,7 @@ def embed_dFC_features(
                 y=y_train,
                 subj_label=subj_label_train,
                 subjects=train_subjects,
-                method="SI",
+                method="localpca",
                 n_neighbors_LE=n_neighbors_LE,
                 search_range_SI=search_range_SI,
                 LE_embedding_method=LE_embedding_method,

From e9f84b23bcb0badc5ee1be1bafa9139e2d4a2183 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 29 Jan 2026 11:32:42 -0500
Subject: [PATCH 333/401] PLS PCA and LE for embed

---
 pydfc/ml_utils.py | 233 +---------------------------------------------
 task_dFC/ML.py    | 175 +---------------------------------
 2 files changed, 6 insertions(+), 402 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index e9c1f65..2d4b21d 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -2076,9 +2076,10 @@ def task_presence_classification(
         },
     }
 
-    check_count = 2
+    EMBEDDINGS = ["PCA", "PLS", "LE"]
+    check_count = len(EMBEDDINGS)
     num_excluded_subjects = 0
-    for embedding in ["PCA", "PLS"]:
+    for embedding in EMBEDDINGS:
         if measure_is_state_based:
             X_train_embedded = process_SB_features(X=X_train, measure_name=measure_name)
             X_test_embedded = process_SB_features(X=X_test, measure_name=measure_name)
@@ -2257,7 +2258,7 @@ def task_presence_classification(
                 len(ML_scores["group_lvl"][key]) == L
             ), f"Length of {key} is not equal to others."
 
-    # L is supposed to be equal to 2 embeddings (PCA and LE) * 2 groups (train and test)
+    # L is supposed to be equal to 3 embeddings (PCA, PLS, and LE) * 2 groups (train and test)
     assert (
         L == check_count * 2
     ), f"Length of group_lvl is not equal to {check_count * 2}, but {L}."
@@ -2271,233 +2272,9 @@ def task_presence_classification(
                 len(ML_scores["subj_lvl"][key]) == L
             ), f"Length of {key} is not equal to others."
 
-    # L is supposed to be equal to number of subjects * 2 embeddings (PCA and LE)
+    # L is supposed to be equal to number of subjects * 3 embeddings (PCA, PLS, and LE)
     assert (
         L == len(SUBJECTS) * check_count - num_excluded_subjects
     ), f"Length of subj_lvl is not equal to {len(SUBJECTS) * check_count - num_excluded_subjects}, but {L}."
 
     return ML_scores
-
-
-################################# Clustering Framework Functions ####################################
-
-
-def task_presence_clustering(
-    task,
-    dFC_id,
-    roi_root,
-    dFC_root,
-    run=None,
-    session=None,
-    normalize_dFC=True,
-):
-    if run is None:
-        print(f"=============== {task} ===============")
-    else:
-        print(f"=============== {task} {run} ===============")
-
-    if task == "task-restingstate":
-        return
-
-    SUBJECTS = find_available_subjects(
-        dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id
-    )
-
-    print(f"Number of subjects: {len(SUBJECTS)}")
-
-    X, _, y, _, subj_label, _, measure_name = dFC_feature_extraction(
-        task=task,
-        train_subjects=SUBJECTS,
-        test_subjects=[],
-        dFC_id=dFC_id,
-        roi_root=roi_root,
-        dFC_root=dFC_root,
-        run=run,
-        session=session,
-        dynamic_pred="no",
-        normalize_dFC=normalize_dFC,
-        FCS_proba_for_SB=True,  # for state-based dFC features, we use FCS_proba
-    )
-
-    clustering_RESULTS = {"PCA": {}, "PLS": {}}
-    clustering_scores = {
-        "subj_id": list(),
-        "task": list(),
-        "run": list(),
-        "dFC method": list(),
-        "Kmeans ARI": list(),
-        "SI": list(),
-        "embedding": list(),
-    }
-    for embedding in ["PCA", "PLS"]:
-        # embed dFC features
-        # if the number of features is smaller than 25, we assume that dimensionality reduction is not needed
-        # specially for state-based dFC features, the number of features is equal to the number of states
-        if X.shape[1] < 25:
-            X_embedded = X
-            print(
-                f"Number of features is {X.shape[1]}. No dimensionality reduction is applied."
-            )
-        else:
-            try:
-                X_embedded, _ = embed_dFC_features(
-                    train_subjects=SUBJECTS,
-                    test_subjects=[],
-                    X_train=X,
-                    X_test=None,
-                    y_train=y,
-                    y_test=None,
-                    subj_label_train=subj_label,
-                    subj_label_test=None,
-                    embedding=embedding,
-                    n_components="auto",
-                    n_neighbors_LE=125,
-                    LE_embedding_method="embed+procrustes",
-                )
-            except:
-                continue
-
-        # clustering
-        # apply kmeans clustering to dFC features
-
-        n_clusters = 2  # corresponding to task and rest
-
-        scaler = StandardScaler()
-        X_normalized = scaler.fit_transform(X_embedded)
-        kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
-        labels_pred = kmeans.fit_predict(X_normalized)
-
-        # ARI score
-        print(f"ARI score: {adjusted_rand_score(y, labels_pred)}")
-
-        # # visualize clustering centroids
-        # centroids = kmeans.cluster_centers_
-        # centroids = pca.inverse_transform(centroids)
-        # centroids = scaler.inverse_transform(centroids)
-        # n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
-        # centroids_mat = dFC_vec2mat(centroids, n_regions)
-
-        clustering_RESULTS[embedding] = {
-            "StandardScaler": scaler,
-            "kmeans": kmeans,
-            "ARI": adjusted_rand_score(y, labels_pred),
-            # "centroids": centroids_mat,
-        }
-
-        for subj in SUBJECTS:
-            clustering_scores["subj_id"].append(subj)
-            features = X_embedded[subj_label == subj, :]
-            target = y[subj_label == subj]
-
-            features_normalized = scaler.transform(features)
-            pred_kmeans = kmeans.predict(features_normalized)
-
-            clustering_scores["Kmeans ARI"].append(
-                adjusted_rand_score(target, pred_kmeans)
-            )
-
-            # silhouette score in terms of separability of original labels, not the clustering labels
-            clustering_scores["SI"].append(silhouette_score(features, target))
-
-            clustering_scores["task"].append(task)
-            clustering_scores["run"].append(run)
-            clustering_scores["dFC method"].append(measure_name)
-            clustering_scores["embedding"].append(embedding)
-
-    return clustering_RESULTS, clustering_scores
-
-
-def co_occurrence(task_labels, clstr_labels):
-    """
-    Calculate the co-occurrence between task labels and clustering labels.
-    """
-    co_occurrence_matrix = np.zeros(
-        (len(np.unique(task_labels)), len(np.unique(clstr_labels)))
-    )
-    for i, task_label in enumerate(np.unique(task_labels)):
-        for j, clstr_label in enumerate(np.unique(clstr_labels)):
-            co_occurrence_matrix[i, j] = np.sum(
-                (task_labels == task_label) & (clstr_labels == clstr_label)
-            )
-
-    # now find the percentage of time each cluster label was present in each task label
-    cluster_label_percentage = (
-        co_occurrence_matrix / np.sum(co_occurrence_matrix, axis=1)[:, None]
-    )
-    # make sure that the sum of each row is 1
-    assert np.allclose(
-        np.sum(cluster_label_percentage, axis=1), 1
-    ), "Sum of each row is not 1."
-
-    # now find the percentage of time each task label occupied each cluster label
-    task_label_percentage = (
-        co_occurrence_matrix / np.sum(co_occurrence_matrix, axis=0)[None, :]
-    )
-    # make sure that the sum of each column is 1
-    assert np.allclose(
-        np.sum(task_label_percentage, axis=0), 1
-    ), "Sum of each column is not 1."
-
-    return co_occurrence_matrix, cluster_label_percentage, task_label_percentage
-
-
-def cluster_for_visual(
-    task,
-    dFC_id,
-    roi_root,
-    dFC_root,
-    run=None,
-    session=None,
-    normalize_dFC=True,
-):
-    if run is None:
-        print(f"=============== {task} ===============")
-    else:
-        print(f"=============== {task} {run} ===============")
-
-    SUBJECTS = find_available_subjects(
-        dFC_root=dFC_root, task=task, run=run, session=session, dFC_id=dFC_id
-    )
-
-    print(f"Number of subjects: {len(SUBJECTS)}")
-
-    X, _, y, _, _, _, measure_name = dFC_feature_extraction(
-        task=task,
-        train_subjects=SUBJECTS,
-        test_subjects=[],
-        dFC_id=dFC_id,
-        roi_root=roi_root,
-        dFC_root=dFC_root,
-        run=run,
-        session=session,
-        dynamic_pred="no",
-        normalize_dFC=normalize_dFC,
-        FCS_proba_for_SB=False,
-    )
-
-    # clustering
-    # apply kmeans clustering to dFC features
-    n_clusters = 5
-
-    kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=5)
-    clstr_labels = kmeans.fit_predict(X)  # clstr_labels = (n_samples,)
-
-    # calculate the co-occurrence matrix
-    co_occurrence_matrix, cluster_label_percentage, task_label_percentage = co_occurrence(
-        y, clstr_labels
-    )
-
-    # get centroids
-    centroids = kmeans.cluster_centers_
-    n_regions = int((1 + np.sqrt(1 + 8 * centroids.shape[1])) / 2)
-    centroids_mat = dFC_vec2mat(
-        centroids, n_regions
-    )  # shape: n_clusters x n_regions x n_regions
-
-    return (
-        centroids_mat,
-        measure_name,
-        co_occurrence_matrix,
-        cluster_label_percentage,
-        task_label_percentage,
-    )
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index e015bcd..136c3ef 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -6,12 +6,7 @@
 import numpy as np
 from joblib import Parallel, delayed
 
-from pydfc.ml_utils import (
-    cluster_for_visual,
-    extract_task_features,
-    task_presence_classification,
-    task_presence_clustering,
-)
+from pydfc.ml_utils import extract_task_features, task_presence_classification
 
 os.environ["MKL_NUM_THREADS"] = "1"
 os.environ["NUMEXPR_NUM_THREADS"] = "1"
@@ -153,139 +148,6 @@ def run_classification(
         np.save(f"{folder}/ML_scores_classify_{dFC_id}.npy", ML_scores)
 
 
-def run_clustering(
-    dFC_id,
-    TASKS,
-    RUNS,
-    SESSIONS,
-    roi_root,
-    dFC_root,
-    output_root,
-    normalize_dFC=True,
-):
-    for session in SESSIONS:
-        if not session is None:
-            print(f"=================== {session} ===================")
-        clustering_scores = {
-            "subj_id": list(),
-            "task": list(),
-            "run": list(),
-            "dFC method": list(),
-            "Kmeans ARI": list(),
-            "SI": list(),
-            "embedding": list(),
-        }
-
-        clustering_RESULTS = {}
-        for task_id, task in enumerate(TASKS):
-            clustering_RESULTS[task] = {}
-            for run in RUNS[task]:
-                try:
-                    clustering_RESULTS_new, clustering_scores_new = (
-                        task_presence_clustering(
-                            task=task,
-                            dFC_id=dFC_id,
-                            roi_root=roi_root,
-                            dFC_root=dFC_root,
-                            run=run,
-                            session=session,
-                            normalize_dFC=normalize_dFC,
-                        )
-                    )
-                    if run is None:
-                        clustering_RESULTS[task] = clustering_RESULTS_new
-                    else:
-                        clustering_RESULTS[task][run] = clustering_RESULTS_new
-                    for key in clustering_scores:
-                        clustering_scores[key].extend(clustering_scores_new[key])
-                except Exception as e:
-                    print(
-                        f"Error in task presence clustering for {session} {task} {run}: {e}"
-                    )
-                    traceback.print_exc()
-
-        if session is None:
-            folder = f"{output_root}/clustering"
-        else:
-            folder = f"{output_root}/clustering/{session}"
-        try:
-            if not os.path.exists(folder):
-                os.makedirs(folder)
-        except OSError as err:
-            print(err)
-        np.save(f"{folder}/clustering_RESULTS_{dFC_id}.npy", clustering_RESULTS)
-
-        np.save(f"{folder}/clustering_scores_{dFC_id}.npy", clustering_scores)
-
-
-def run_clustering_for_visual(
-    dFC_id,
-    TASKS,
-    RUNS,
-    SESSIONS,
-    roi_root,
-    dFC_root,
-    output_root,
-    normalize_dFC=True,
-):
-    for session in SESSIONS:
-        if not session is None:
-            print(f"=================== {session} ===================")
-
-        for task_id, task in enumerate(TASKS):
-            for run in RUNS[task]:
-                try:
-                    (
-                        centroids_mat,
-                        measure_name,
-                        co_occurrence_matrix,
-                        cluster_label_percentage,
-                        task_label_percentage,
-                    ) = cluster_for_visual(
-                        task=task,
-                        dFC_id=dFC_id,
-                        roi_root=roi_root,
-                        dFC_root=dFC_root,
-                        run=run,
-                        session=session,
-                        normalize_dFC=normalize_dFC,
-                    )
-
-                    centroids = {
-                        "centroids_mat": centroids_mat,
-                        "co_occurrence_matrix": co_occurrence_matrix,
-                        "cluster_label_percentage": cluster_label_percentage,
-                        "task_label_percentage": task_label_percentage,
-                    }
-
-                    # save the centroids
-                    suffix = "centroids"
-                    if session is not None:
-                        suffix = f"{suffix}_{session}"
-                    suffix = f"{suffix}_{task}"
-                    if run is not None:
-                        suffix = f"{suffix}_{run}"
-                    suffix = f"{suffix}_{measure_name}"
-
-                    if session is None:
-                        folder = f"{output_root}/centroids"
-                    else:
-                        folder = f"{output_root}/centroids/{session}"
-                    if not os.path.exists(folder):
-                        os.makedirs(folder)
-
-                    np.save(
-                        f"{folder}/{suffix}.npy",
-                        centroids,
-                    )
-
-                except Exception as e:
-                    print(
-                        f"Error in clustering for visualization for {session} {task} {run}: {e}"
-                    )
-                    traceback.print_exc()
-
-
 #######################################################################################
 
 if __name__ == "__main__":
@@ -384,41 +246,6 @@ def run_clustering_for_visual(
         print(f"Error in classification for dFC ID {dFC_id}: {e}")
         traceback.print_exc()
     print(f"Task presence classification finished for dFC ID {dFC_id}.")
-    # print(f"Task presence clustering started for dFC ID {dFC_id} ...")
-    # try:
-    #     run_clustering(
-    #         dFC_id=dFC_id,
-    #         TASKS=TASKS,
-    #         RUNS=RUNS,
-    #         SESSIONS=SESSIONS,
-    #         roi_root=roi_root,
-    #         dFC_root=dFC_root,
-    #         output_root=ML_root,
-    #         normalize_dFC=True,
-    #     )
-    # except Exception as e:
-    #     print(f"Error in clustering for dFC ID {dFC_id}: {e}")
-    #     traceback.print_exc()
-
-    # print(f"Task presence clustering finished for dFC ID {dFC_id}.")
-
-    # print(f"Clustering for visualization started for dFC ID {dFC_id} ...")
-    # try:
-    #     run_clustering_for_visual(
-    #         dFC_id=dFC_id,
-    #         TASKS=TASKS,
-    #         RUNS=RUNS,
-    #         SESSIONS=SESSIONS,
-    #         roi_root=roi_root,
-    #         dFC_root=dFC_root,
-    #         output_root=ML_root,
-    #         normalize_dFC=True,
-    #     )
-    # except Exception as e:
-    #     print(f"Error in clustering for visualization for dFC ID {dFC_id}: {e}")
-    #     traceback.print_exc()
-
-    # print(f"Clustering for visualization finished for dFC ID {dFC_id}.")
 
     print(f"Task presence prediction finished for dFC ID {dFC_id}.")
 

From e5fb0f509cd60af336d677f57e549c91fc39167a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 2 Feb 2026 10:07:14 -0500
Subject: [PATCH 334/401] remove some datasets

---
 task_dFC/run_scripts_slurm/multi_dataset_info.json | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/multi_dataset_info.json b/task_dFC/run_scripts_slurm/multi_dataset_info.json
index cb32fd2..7f3e98c 100644
--- a/task_dFC/run_scripts_slurm/multi_dataset_info.json
+++ b/task_dFC/run_scripts_slurm/multi_dataset_info.json
@@ -3,11 +3,11 @@
 	"real_data": {
 		"main_root": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro",
 		"DATASETS": [
-			"ds001242", "ds001734", "ds002236", "ds002647",
-			"ds002785", "ds002843", "ds002994", "ds003242",
-			"ds003465", "ds003612", "ds003717", "ds003823",
-			"ds004044", "ds004302", "ds004349", "ds004359",
-			"ds004556", "ds004711", "ds004746", "ds004791",
+			"ds001242", "ds002236", "ds002647",
+			"ds002843", "ds002994", "ds003242",
+			"ds003465", "ds003612", "ds003823",
+			"ds004044", "ds004349", "ds004359",
+			"ds004556", "ds004746", "ds004791",
 			"ds004848", "ds005038"
 		],
 		"TASKS_to_include": [

From 1be479c2e02b1bc678b38f4a83b1d2bf51cc04cb Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 2 Feb 2026 17:33:22 -0500
Subject: [PATCH 335/401] minor

---
 pydfc/ml_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 2d4b21d..a954b51 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -883,7 +883,7 @@ def find_intrinsic_dim(
             intrinsic_dim_diff_k = list()
             # seatryrch 0.2 * X_subj.shape[0] and 0.3 * X_subj.shape[0] for k
             for k in range(
-                int(0.1 * X_subj.shape[0]),
+                max(5, int(0.1 * X_subj.shape[0])),  # not letting go below 5
                 int(0.3 * X_subj.shape[0]),
                 5,
             ):

From 6ad14754ca91a9cb2c54011649ebc0a9bebbe140 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 6 Feb 2026 14:06:27 -0500
Subject: [PATCH 336/401] add PLS to ml_results

---
 task_dFC/multi_dataset_analysis/ml_results.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index 4bae943..1fd2f18 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -159,10 +159,13 @@
     TARGETS = [
         ("PCA", "Logistic regression balanced accuracy"),
         ("LE", "Logistic regression balanced accuracy"),
+        ("PLS", "Logistic regression balanced accuracy"),
         ("PCA", "SVM balanced accuracy"),
         ("LE", "SVM balanced accuracy"),
+        ("PLS", "SVM balanced accuracy"),
         ("LE", "SI"),
         ("PCA", "SI"),
+        ("PLS", "SI"),
     ]
     # -------------------------------------------------------------------
 

From 8d67c90aec44b78932607be15fc919bde28457b9 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 6 Feb 2026 14:20:21 -0500
Subject: [PATCH 337/401] minor

---
 .../sample_matrix_visualization.py            | 47 ++++---------------
 1 file changed, 9 insertions(+), 38 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 3865664..32e8da8 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -282,45 +282,16 @@
                         save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-label_{task}_{group}{raw_or_embedded}.png",
                         show=False,
                     )
-                    if (
-                        task == "task-localiser"
-                        and group == "train"
-                        and raw_or_embedded == ""
-                        and simul_or_real == "real"
-                        and (
-                            measure_name == "SlidingWindow" or measure_name == "Time-Freq"
-                        )
-                    ):
-                        plot_samples_features(
-                            X,
-                            y,
-                            sample_order="label",
-                            feature_order="original",
-                            save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-label_{task}_{group}{raw_or_embedded}.svg",
-                            show=False,
-                        )
 
-                    # C) Label + within-class clustering
-                    if group == "train":
-                        orders = plot_samples_features(
-                            X,
-                            y,
-                            sample_order="label+cluster",
-                            feature_order="original",
-                            save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
-                            show=False,
-                        )
-                    elif group == "test":
-                        # Apply the *same feature order* to test (no leakage from test):
-                        plot_samples_features(
-                            X,
-                            y,
-                            sample_order="label+cluster",  # clustering is per-split; that’s fine
-                            feature_order="original",
-                            col_order_from_train=orders["col_order"],
-                            save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-samples_{task}_{group}{raw_or_embedded}.png",
-                            show=False,
-                        )
+                    # C) clustering
+                    plot_samples_features(
+                        X,
+                        y,
+                        sample_order="cluster",
+                        feature_order="original",
+                        save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_clustered-samples_{task}_{group}{raw_or_embedded}.png",
+                        show=False,
+                    )
 
                     save_scalar_colorbar(
                         cmap="coolwarm",

From e47a42fab37c9683f5ac0c80c338f7303e27af92 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 8 Feb 2026 10:39:26 -0500
Subject: [PATCH 338/401] temporary

---
 .../sample_matrix_visualization.py                  | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 32e8da8..39310ca 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -58,7 +58,18 @@
     if simul_or_real == "real":
         main_root = multi_dataset_info["real_data"]["main_root"]
         DATASETS = multi_dataset_info["real_data"]["DATASETS"]
-        TASKS_to_include = multi_dataset_info["real_data"]["TASKS_to_include"]
+        # TODO(temporary): restore TASKS_to_include = multi_dataset_info["real_data"]["TASKS_to_include"]
+        TASKS_to_include = [
+            "task-Axcpt",
+            "task-CIC",
+            "task-Cuedts",
+            "task-feedback",
+            "task-IHG",
+            "task-matching",
+            "task-motor",
+            "task-Stern",
+            "task-Stroop",
+        ]
     elif simul_or_real == "simulated":
         main_root = multi_dataset_info["simulated_data"]["main_root"]
         DATASETS = multi_dataset_info["simulated_data"]["DATASETS"]

From 4787068ae6044c14a532402c3206f235f0eba139 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 10 Feb 2026 12:59:35 -0500
Subject: [PATCH 339/401] update KNN_classify

---
 pydfc/ml_utils.py | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index a954b51..2e47180 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1684,23 +1684,39 @@ def SVM_classify(X_train, y_train, X_test, y_test, subj_label_train=None):
     return RESULT
 
 
-def KNN_classify(X_train, y_train, X_test, y_test):
+def KNN_classify(X_train, y_train, X_test, y_test, subj_label_train=None):
     """
     KNN classification
     """
-    # create a pipeline with a knn model to find the best n_neighbors
-    knn = make_pipeline(
+
+    # create a dictionary of all values we want to test for n_neighbors
+    param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)}
+
+    # perform grid search
+    model_for_hyperparam = make_pipeline(
         StandardScaler(),
         KNeighborsClassifier(),
     )
-    # create a dictionary of all values we want to test for n_neighbors
-    param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)}
-    # use gridsearch to test all values for n_neighbors
-    knn_gscv = GridSearchCV(knn, param_grid, cv=5)
-    # fit model to data
-    knn_gscv.fit(X_train, y_train)
 
-    n_neighbors = knn_gscv.best_params_["kneighborsclassifier__n_neighbors"]
+    # use StratifiedGroupKFold to ensure that the same subject is not in both train and test sets
+    # shuffle the data to ensure time points are shuffled
+    if subj_label_train is None:
+        X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train)
+        cv = StratifiedKFold(n_splits=3)
+    else:
+        X_train_shuffled, y_train_shuffled, subj_label_train_shuffled = shuffle(
+            X_train, y_train, subj_label_train
+        )
+        cv = StratifiedGroupKFold(n_splits=3)
+    model_gscv = GridSearchCV(model_for_hyperparam, param_grid, cv=cv, n_jobs=-1)
+    if subj_label_train is None:
+        model_gscv.fit(X_train_shuffled, y_train_shuffled)
+    else:
+        model_gscv.fit(
+            X_train_shuffled, y_train_shuffled, groups=subj_label_train_shuffled
+        )
+
+    n_neighbors = model_gscv.best_params_["kneighborsclassifier__n_neighbors"]
 
     model = make_pipeline(
         StandardScaler(),

From db0bdfe2a19a2b91e78d0e671cba676901ec2f04 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 12 Feb 2026 20:40:00 -0500
Subject: [PATCH 340/401] standardize before PCA and PLS, rank_norm change

---
 pydfc/ml_utils.py | 99 +++++++++++++++++++++++++++++------------------
 1 file changed, 62 insertions(+), 37 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 2e47180..da68d37 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -265,7 +265,7 @@ def dFC_feature_extraction_subj_lvl(
         dFC_mat = dFC.get_dFC_mat()
         TR_array = dFC.TR_array
         if normalize_dFC:
-            dFC_mat = rank_norm(dFC_mat)
+            dFC_mat = rank_norm(dFC_mat, global_norm=False)
         dFC_vecs = dFC_mat2vec(dFC_mat)
 
     # event data
@@ -1218,22 +1218,24 @@ def subject_center(X, subj_labels, mode="zscore"):
     return Xc
 
 
-def select_pls_components_binary_groupcv(
+def select_num_components_binary_groupcv(
     X,
     y,
     groups,
+    embedding_method="PLS",
     n_list=(2, 5, 10, 15, 20),
     cv=3,
     random_state=0,
 ):
     """
-    Select number of PLS components using subject-aware CV.
+    Select number of PLS/PCA components using subject-aware CV.
 
     Parameters
     ----------
     X : array (n_samples, n_features)
     y : array (n_samples,) binary labels
     groups : array (n_samples,) subject IDs
+    embedding_method : "PLS" or "PCA"
     n_list : iterable of candidate n_components
     cv : number of folds
     random_state : int
@@ -1241,7 +1243,7 @@ def select_pls_components_binary_groupcv(
     Returns
     -------
     best_n : int
-        Selected number of PLS components
+        Selected number of PLS/PCA components
     best_score : float
         Mean CV balanced accuracy
     """
@@ -1260,10 +1262,15 @@ def select_pls_components_binary_groupcv(
         fold_scores = []
 
         for tr, va in cv_splitter.split(X, y, groups):
-            # ---- PLS embedding (trained ONLY on train fold subjects)
-            emb = PLSEmbedder(n_components=n, scale=True)
-            Ztr = emb.fit_transform(X[tr], y[tr])
-            Zva = emb.transform(X[va])
+            # ---- embedding (trained ONLY on train fold subjects)
+            if embedding_method == "PCA":
+                emb = PCA(n_components=n, svd_solver="full", whiten=False)
+                Ztr = emb.fit_transform(X[tr])
+                Zva = emb.transform(X[va])
+            elif embedding_method == "PLS":
+                emb = PLSEmbedder(n_components=n, scale=True)
+                Ztr = emb.fit_transform(X[tr], y[tr])
+                Zva = emb.transform(X[va])
 
             # ---- classifier in latent space
             clf = make_pipeline(
@@ -1284,22 +1291,24 @@ def select_pls_components_binary_groupcv(
     return best_n, best_score
 
 
-def select_pls_components_continuous_groupcv(
+def select_num_components_continuous_groupcv(
     X,
     y,
     groups,
+    embedding_method="PLS",
     n_list=(2, 5, 10, 15, 20),
     cv=3,
     score="r2",  # "r2" or "neg_mse"
 ):
     """
-    Select number of PLS components using subject-aware CV for a CONTINUOUS target.
+    Select number of PLS/PCA components using subject-aware CV for a CONTINUOUS target.
 
     Parameters
     ----------
     X : array (n_samples, n_features)
     y : array (n_samples,) continuous target
     groups : array (n_samples,) subject IDs
+    embedding_method : "PLS" or "PCA"
     n_list : iterable of candidate n_components
     cv : number of folds
     score : "r2" or "neg_mse"
@@ -1307,7 +1316,7 @@ def select_pls_components_continuous_groupcv(
     Returns
     -------
     best_n : int
-        Selected number of PLS components
+        Selected number of PLS/PCA components
     best_score : float
         Mean CV score (higher is better)
         - R² if score="r2"
@@ -1329,12 +1338,16 @@ def select_pls_components_continuous_groupcv(
         fold_scores = []
 
         for tr, va in cv_splitter.split(X, y, groups):
-            # ---- PLS embedding (trained ONLY on train fold subjects)
-            emb = PLSEmbedder(n_components=n, scale=True)
-            # PLSRegression expects y 2D
-            Ztr = emb.fit_transform(X[tr], y[tr].reshape(-1, 1))
-            Zva = emb.transform(X[va])
-
+            # ---- embedding (trained ONLY on train fold subjects)
+            if embedding_method == "PCA":
+                emb = PCA(n_components=n, svd_solver="full", whiten=False)
+                Ztr = emb.fit_transform(X[tr])
+                Zva = emb.transform(X[va])
+            elif embedding_method == "PLS":
+                emb = PLSEmbedder(n_components=n, scale=True)
+                # PLSRegression expects y 2D
+                Ztr = emb.fit_transform(X[tr], y[tr].reshape(-1, 1))
+                Zva = emb.transform(X[va])
             # ---- regressor in latent space
             reg = make_pipeline(
                 StandardScaler(),
@@ -1385,29 +1398,28 @@ def embed_dFC_features(
     if X_test is not None:
         X_test = X_test.copy()
 
-    if embedding == "PCA":
-        # if n_components is not specified, use 95% of the variance
-        if n_components == "auto":
-            pca = PCA(n_components=0.95, svd_solver="full", whiten=False)
+    # preprocess the data by standardizing it
+    if embedding in ("PCA", "PLS"):
+        # center the data by subject before PCA/PLS to remove subject effects
+        X_train_c = subject_center(X_train, subj_label_train, mode="zscore")
+        if X_test is not None:
+            X_test_c = subject_center(X_test, subj_label_test, mode="zscore")
         else:
-            pca = PCA(n_components=n_components, svd_solver="full", whiten=False)
-        pca.fit(X_train)
-        X_train_embed = pca.transform(X_train)
+            X_test_c = None
+        scaler = StandardScaler(with_mean=True, with_std=True)
+        X_train_preproc = scaler.fit_transform(X_train_c)
         if X_test is not None:
-            X_test_embed = pca.transform(X_test)
+            X_test_preproc = scaler.transform(X_test_c)
         else:
-            X_test_embed = None
-    elif embedding == "PLS":
-        # center the data by subject before PLS to remove subject effects
-        X_train_c = subject_center(X_train, subj_label_train, mode="zscore")
-        X_test_c = subject_center(X_test, subj_label_test, mode="zscore")
-        # if n_components is not specified, select it using subject-aware CV on the training set
+            X_test_preproc = None
+
         if n_components == "auto":
             if y_continuous:
-                best_n, _ = select_pls_components_continuous_groupcv(
-                    X=X_train_c,
+                best_n, _ = select_num_components_continuous_groupcv(
+                    X=X_train_preproc,
                     y=y_train,
                     groups=subj_label_train,
+                    embedding_method=embedding,
                     n_list=[
                         2,
                         3,
@@ -1425,10 +1437,11 @@ def embed_dFC_features(
                     score="r2",
                 )
             else:
-                best_n, _ = select_pls_components_binary_groupcv(
-                    X=X_train_c,
+                best_n, _ = select_num_components_binary_groupcv(
+                    X=X_train_preproc,
                     y=y_train,
                     groups=subj_label_train,
+                    embedding_method=embedding,
                     n_list=[
                         2,
                         3,
@@ -1446,11 +1459,23 @@ def embed_dFC_features(
                 )
             n_components = best_n
 
+    if embedding == "PCA":
+        pca = PCA(n_components=n_components, svd_solver="full", whiten=False)
+        pca.fit(X_train_preproc)
+        X_train_embed = pca.transform(X_train_preproc)
+        if X_test is not None:
+            X_test_embed = pca.transform(X_test_preproc)
+        else:
+            X_test_embed = None
+    elif embedding == "PLS":
         pls = PLSEmbedder(n_components=n_components, scale=True)
         # fit on train set
-        X_train_embed = pls.fit_transform(X_train_c, y_train)
+        X_train_embed = pls.fit_transform(X_train_preproc, y_train)
         # only transform test set
-        X_test_embed = pls.transform(X_test_c)
+        if X_test is not None:
+            X_test_embed = pls.transform(X_test_preproc)
+        else:
+            X_test_embed = None
     elif embedding == "LE":
         # if the dFC features are not unique (state-based), set the LE_embedding_method to "concat+embed"
         if measure_is_state_based:

From 6af24d555526764821d794b46feccdfeb2707236 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 12 Feb 2026 21:33:07 -0500
Subject: [PATCH 341/401] improve errors in LE

---
 pydfc/ml_utils.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index da68d37..666e431 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -887,6 +887,10 @@ def find_intrinsic_dim(
                 int(0.3 * X_subj.shape[0]),
                 5,
             ):
+                if k == 1:
+                    print(
+                        f"Warning: k=1 is not valid for localpca_intrinsic_dim. Skipping k=1 for subject {X_subj.shape[0]} {max(5, int(0.1 * X_subj.shape[0]))}."
+                    )
                 try:
                     d_global, _ = localpca_intrinsic_dim(
                         X_subj,
@@ -897,13 +901,21 @@ def find_intrinsic_dim(
                         metric="correlation",
                         agg="median",
                     )
-                    intrinsic_dim_diff_k.append(d_global)
+                    if np.isfinite(d_global) and d_global >= 1:
+                        intrinsic_dim_diff_k.append(d_global)
                 except Exception as e:
                     warnings.warn(
                         f"Error in localpca_intrinsic_dim for subject {subject} with k={k}: {e}."
                     )
                     continue
+            if len(intrinsic_dim_diff_k) == 0:
+                warnings.warn(
+                    f"No valid intrinsic dimensions found for subject {subject}."
+                )
+                continue
             intrinsic_dim_all.append(int(np.mean(intrinsic_dim_diff_k)))
+        if len(intrinsic_dim_all) == 0:
+            raise ValueError("No valid intrinsic dimensions found for any subject.")
         intrinsic_dim = int(np.median(intrinsic_dim_all))
     return intrinsic_dim
 

From 29335223829c4418e9b136553a0d4b4d1c498f6b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 12 Feb 2026 22:50:37 -0500
Subject: [PATCH 342/401] minor

---
 pydfc/ml_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 666e431..cf47a07 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -889,8 +889,9 @@ def find_intrinsic_dim(
             ):
                 if k == 1:
                     print(
-                        f"Warning: k=1 is not valid for localpca_intrinsic_dim. Skipping k=1 for subject {X_subj.shape[0]} {max(5, int(0.1 * X_subj.shape[0]))}."
+                        f"Warning: k=1 is not valid for localpca_intrinsic_dim. Skipping k=1 for subject {subject}."
                     )
+                    continue
                 try:
                     d_global, _ = localpca_intrinsic_dim(
                         X_subj,

From 1783303887599bb1e94d808be52ea334f1d3f58a Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 12 Feb 2026 22:51:08 -0500
Subject: [PATCH 343/401] minor

---
 pydfc/ml_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index cf47a07..c9ae682 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -888,7 +888,7 @@ def find_intrinsic_dim(
                 5,
             ):
                 if k == 1:
-                    print(
+                    warnings.warn(
                         f"Warning: k=1 is not valid for localpca_intrinsic_dim. Skipping k=1 for subject {subject}."
                     )
                     continue

From e67ba04a4814d69f4a1d9a85b5e335323d61e9b0 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 14 Feb 2026 18:32:04 -0500
Subject: [PATCH 344/401] remove CIC from analysis

---
 task_dFC/run_scripts_slurm/multi_dataset_info.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/task_dFC/run_scripts_slurm/multi_dataset_info.json b/task_dFC/run_scripts_slurm/multi_dataset_info.json
index 7f3e98c..fd4b892 100644
--- a/task_dFC/run_scripts_slurm/multi_dataset_info.json
+++ b/task_dFC/run_scripts_slurm/multi_dataset_info.json
@@ -4,14 +4,14 @@
 		"main_root": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro",
 		"DATASETS": [
 			"ds001242", "ds002236", "ds002647",
-			"ds002843", "ds002994", "ds003242",
+			"ds002843", "ds002994",
 			"ds003465", "ds003612", "ds003823",
 			"ds004044", "ds004349", "ds004359",
 			"ds004556", "ds004746", "ds004791",
 			"ds004848", "ds005038"
 		],
 		"TASKS_to_include": [
-			"task-arithmetic", "task-AudSem", "task-Axcpt", "task-CIC",
+			"task-arithmetic", "task-AudSem", "task-Axcpt",
 			"task-Cuedts", "task-emotionRegulation", "task-execution","task-expo",
 			"task-fearlearning", "task-feedback", "task-fribBids", "task-IHG",
 			"task-imagery", "task-itc", "task-localiser", "task-Localizer",

From 8e76e96a6f2eb1a6523c07b0de59d9a8cff29024 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 24 Feb 2026 15:45:07 -0500
Subject: [PATCH 345/401] put embedding in CV

---
 pydfc/ml_utils.py | 149 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 111 insertions(+), 38 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index c9ae682..165626b 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -10,7 +10,7 @@
 
 import numpy as np
 from scipy.spatial import procrustes
-from sklearn.base import clone
+from sklearn.base import BaseEstimator, TransformerMixin, clone
 from sklearn.cluster import KMeans
 from sklearn.cross_decomposition import PLSRegression
 from sklearn.decomposition import PCA
@@ -19,7 +19,6 @@
 from sklearn.manifold import SpectralEmbedding
 from sklearn.metrics import (
     accuracy_score,
-    adjusted_rand_score,
     average_precision_score,
     balanced_accuracy_score,
     confusion_matrix,
@@ -37,7 +36,7 @@
     StratifiedKFold,
 )
 from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors, kneighbors_graph
-from sklearn.pipeline import make_pipeline
+from sklearn.pipeline import Pipeline, make_pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC, SVR
 from sklearn.utils import shuffle
@@ -1161,60 +1160,57 @@ def rows_look_redundant(X, sample=100):
     return (len(h) - len(set(h))) / len(h) > 0.5
 
 
-class PLSEmbedder:
+class PLSEmbedder(BaseEstimator, TransformerMixin):
     """
-    Supervised dimensionality reduction using PLSRegression for binary labels.
-    Produces low-dim 'scores' features for downstream classifiers.
+    Supervised dimensionality reduction using PLSRegression.
+    Returns X scores (latent components) for downstream models.
 
-    Usage:
-        pls = PLSEmbedder(n_components=10)
-        Z_train = pls.fit_transform(X_train, y_train)
-        Z_test  = pls.transform(X_test)
+    Notes:
+    - Works for binary y (0/1) and also continuous y (regression-style PLS).
+    - For classification, y should typically be 0/1 or {-1,1}.
     """
 
-    def __init__(self, n_components=10, scale=True):
-        self.n_components = int(n_components)
-        self.scale = bool(scale)
-
-        self.scaler_ = None
-        self.model_ = None
+    def __init__(self, n_components=10, scale=False):
+        self.n_components = n_components
+        self.scale = scale
 
     def fit(self, X, y):
         X = np.asarray(X)
-        y = np.asarray(y)
+        y = np.asarray(y).ravel().reshape(-1, 1)
 
-        if y.ndim == 1:
-            y = y.reshape(-1, 1)
-        elif y.ndim == 2:
-            if y.shape[0] != X.shape[0]:
-                raise ValueError(f"y has shape {y.shape} but X has shape {X.shape}.")
-        else:
-            raise ValueError("y must be 1D or 2D.")
+        if X.shape[0] != y.shape[0]:
+            raise ValueError(f"X has {X.shape[0]} rows but y has {y.shape[0]}.")
 
+        # optional internal scaling (usually OFF if pipeline already scales)
         if self.scale:
             self.scaler_ = StandardScaler(with_mean=True, with_std=True)
             Xs = self.scaler_.fit_transform(X)
         else:
+            self.scaler_ = None
             Xs = X
 
-        self.model_ = PLSRegression(n_components=self.n_components, scale=False)
+        # safety: reject n_components larger than this fold supports (raises, no silent capping)
+        nmax = min(Xs.shape[0] - 1, Xs.shape[1])
+        ncomp = int(self.n_components)
+        if ncomp > nmax:
+            raise ValueError(
+                f"n_components={ncomp} is too large for fold with "
+                f"n_samples={Xs.shape[0]}, n_features={Xs.shape[1]} (max {nmax})."
+            )
+
+        self.model_ = PLSRegression(n_components=ncomp, scale=False)
         self.model_.fit(Xs, y)
         return self
 
-    def fit_transform(self, X, y):
-        self.fit(X, y)
-        return self.transform(X)
-
     def transform(self, X):
-        if self.model_ is None:
+        if not hasattr(self, "model_"):
             raise RuntimeError("PLSEmbedder is not fitted yet.")
-        X = np.asarray(X)
 
-        Xs = self.scaler_.transform(X) if self.scale else X
+        X = np.asarray(X)
+        Xs = self.scaler_.transform(X) if self.scaler_ is not None else X
 
-        # PLS scores (latent components)
-        # sklearn exposes x_scores_ only for training; for new data:
-        Z = Xs @ self.model_.x_rotations_  # (n_samples, n_components)
+        # Out-of-sample scores
+        Z = Xs @ self.model_.x_rotations_
         return Z.astype(np.float32, copy=False)
 
 
@@ -1669,22 +1665,90 @@ def logistic_regression_classify(X_train, y_train, X_test, y_test, subj_label_tr
     return RESULT
 
 
-def SVM_classify(X_train, y_train, X_test, y_test, subj_label_train=None):
+def SVM_classify(
+    X_train,
+    y_train,
+    X_test,
+    y_test,
+    subj_label_train=None,
+    embedding_method="PCA",
+):
+    if embedding_method == "PCA":
+        emb = PCA(whiten=False, svd_solver="full", random_state=0)
+    elif embedding_method == "PLS":
+        emb = PLSEmbedder(scale=False)  # IMPORTANT: avoid double scaling
+    else:
+        raise ValueError("embedding_method must be 'PCA' or 'PLS'.")
+
+    pipe = Pipeline(
+        [
+            ("scaler", StandardScaler(with_mean=True, with_std=True)),
+            ("emb", emb),
+            ("svc", SVC(kernel="rbf")),
+        ]
+    )
+
+    # Grid (keep small!)
+    param_grid = {
+        "emb__n_components": [5, 10, 20, 30, 50, 100],
+        "svc__C": [0.1, 1, 10],
+        "svc__gamma": ["scale", 0.01, 0.1],
+    }
+
+    # CV splitter
+    if subj_label_train is None:
+        Xs, ys = shuffle(X_train, y_train, random_state=0)
+        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
+        fit_kwargs = {}
+    else:
+        Xs, ys, gs = shuffle(X_train, y_train, subj_label_train, random_state=0)
+        cv = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=0)
+        fit_kwargs = {"groups": gs}
+
+    # GridSearch on training subjects only
+    gscv = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=-1, scoring="balanced_accuracy")
+    gscv.fit(Xs, ys, **fit_kwargs)
+
+    # Evaluate with best estimator (already refit on full training set by default)
+    model = gscv.best_estimator_
+
+    RESULT = get_classification_results(
+        X_train=X_train,
+        X_test=X_test,
+        y_train=y_train,
+        y_test=y_test,
+        classifier_model=model,
+    )
+    RESULT["best_params"] = gscv.best_params_
+    return RESULT
+
+
+def SVM_classify(
+    X_train, y_train, X_test, y_test, subj_label_train=None, embedding_method="PCA"
+):
     """
     SVM classification
 
     provide subj_label_train if you want to use StratifiedGroupKFold
     to ensure that the same subject is not in both train and test sets
     """
+    if embedding_method == "PCA":
+        grid_embedding_name = "pca__n_components"
+        embedding_model = PCA(whiten=False, svd_solver="full")
+    elif embedding_method == "PLS":
+        grid_embedding_name = "pls__n_components"
+        embedding_model = PLSEmbedder(scale=True)
     # define the parameter grid
     param_grid = {
-        "svc__C": [0.01, 0.1, 1, 10],
-        "svc__gamma": ["scale", 0.01, 0.05, 0.1],
+        grid_embedding_name: [5, 10, 20, 30, 50, 100],
+        "svc__C": [0.1, 1, 10],
+        "svc__gamma": ["scale", 0.01, 0.1],
     }
 
     # perform grid search
     model_for_hyperparam = make_pipeline(
         StandardScaler(),
+        embedding_model,
         SVC(kernel="rbf"),
     )
     # use StratifiedGroupKFold to ensure that the same subject is not in both train and test sets
@@ -1704,11 +1768,20 @@ def SVM_classify(X_train, y_train, X_test, y_test, subj_label_train=None):
         model_gscv.fit(
             X_train_shuffled, y_train_shuffled, groups=subj_label_train_shuffled
         )
+    n_components = model_gscv.best_params_[grid_embedding_name]
     C = model_gscv.best_params_["svc__C"]
     gamma = model_gscv.best_params_["svc__gamma"]
 
+    if embedding_method == "PCA":
+        embedding_model_final = PCA(
+            n_components=n_components, whiten=False, svd_solver="full"
+        )
+    elif embedding_method == "PLS":
+        embedding_model_final = PLSEmbedder(n_components=n_components, scale=True)
+
     model = make_pipeline(
         StandardScaler(),
+        embedding_model_final,
         SVC(kernel="rbf", C=C, gamma=gamma),
     )
 

From bbc1fd0e1b7144e73ecac0d5fe866b69d8bd1425 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 24 Feb 2026 16:43:08 -0500
Subject: [PATCH 346/401] add ts2vec

---
 .../train_ts2vec_dfc_embeddings.py            | 734 ++++++++++++++++++
 1 file changed, 734 insertions(+)
 create mode 100644 task_dFC/multi_dataset_analysis/train_ts2vec_dfc_embeddings.py

diff --git a/task_dFC/multi_dataset_analysis/train_ts2vec_dfc_embeddings.py b/task_dFC/multi_dataset_analysis/train_ts2vec_dfc_embeddings.py
new file mode 100644
index 0000000..f594b0b
--- /dev/null
+++ b/task_dFC/multi_dataset_analysis/train_ts2vec_dfc_embeddings.py
@@ -0,0 +1,734 @@
+import argparse
+import json
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+import numpy as np
+import pandas as pd
+
+from pydfc.ml_utils import (
+    dFC_feature_extraction_subj_lvl,
+    find_available_subjects,
+    load_dFC,
+    load_task_data,
+)
+
+
+def str2bool(v: Any) -> bool:
+    """Map a bool, or a case-insensitive yes/no style token, to a bool for argparse."""
+    if isinstance(v, bool):
+        return v
+    token = str(v).strip().lower()
+    truthy, falsy = {"1", "true", "t", "yes", "y"}, {"0", "false", "f", "no", "n"}
+    if token not in truthy | falsy:
+        raise argparse.ArgumentTypeError(f"Invalid boolean value: {v}")
+    return token in truthy
+
+
+def parse_json_arg(value: Optional[str]) -> Dict[str, Any]:
+    """Decode a JSON-object CLI string; None or blank text yields an empty dict."""
+    # Blank input short-circuits to {} without ever invoking the JSON parser.
+    decoded = json.loads(value) if value is not None and value.strip() else {}
+    if isinstance(decoded, dict):
+        return decoded
+    raise ValueError("JSON argument must be an object/dict.")
+
+
+def normalize_optional_token(value: Optional[str]) -> Optional[str]:
+    """Translate the literal tokens "None"/"none"/"null" into None.
+
+    Any other value, including an actual None, is returned unchanged.
+    """
+    return None if value in {"None", "none", "null"} else value
+
+
+def choose_subjects(
+    subjects: Sequence[str],
+    max_subjects_per_scan: Optional[int],
+    rng: np.random.Generator,
+) -> List[str]:
+    """Return subjects sorted; above the cap, keep a random subset in sorted order."""
+    ordered = sorted(subjects)
+    if max_subjects_per_scan is None or len(ordered) <= max_subjects_per_scan:
+        return ordered
+    picks = rng.choice(len(ordered), size=max_subjects_per_scan, replace=False)
+    return [ordered[i] for i in np.sort(picks)]
+
+
+def load_multi_dataset_spec(
+    multi_dataset_info_path: str, simul_or_real: str
+) -> Tuple[Dict[str, Any], str, List[str], List[str]]:
+    """Read the multi-dataset JSON config and unpack the section for one data kind.
+
+    Returns (config, main_root, DATASETS list, TASKS_to_include list); raises
+    ValueError when simul_or_real is neither 'real' nor 'simulated'.
+    """
+    with open(multi_dataset_info_path, "r") as f:
+        config = json.load(f)
+    section_keys = {"real": "real_data", "simulated": "simulated_data"}
+    if simul_or_real not in section_keys:
+        raise ValueError("--simul_or_real must be 'real' or 'simulated'")
+    section = config[section_keys[simul_or_real]]
+    main_root = section["main_root"]
+    datasets, tasks = list(section["DATASETS"]), list(section["TASKS_to_include"])
+    return config, main_root, datasets, tasks
+
+
+def load_dataset_info(
+    dataset_info_file: str,
+) -> Tuple[List[Optional[str]], List[str], Dict[str, List[Optional[str]]]]:
+    """Read a dataset_info.json file and return (sessions, tasks, runs per task).
+
+    "SESSIONS" and "RUNS" are optional and "TASKS" is required. A missing or null
+    session list becomes [None]; so does the run list of any task that "RUNS" does
+    not cover or maps to null, instead of raising KeyError.
+    """
+    with open(dataset_info_file, "r") as f:
+        dataset_info = json.load(f)
+
+    sessions = dataset_info.get("SESSIONS")
+    if sessions is None:
+        sessions = [None]
+    tasks = dataset_info["TASKS"]
+
+    # `or {}` folds a missing and a null "RUNS" entry into the same empty mapping.
+    declared = dataset_info.get("RUNS") or {}
+    runs = {t: ([None] if declared.get(t) is None else declared[t]) for t in tasks}
+    return sessions, tasks, runs
+
+
+def prepare_ts2vec_input(
+    sequences: Sequence[np.ndarray],
+    seq_len_mode: str,
+    pad_value: float,
+    target_seq_len: Optional[int] = None,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Stack variable-length (time, feature) arrays into one float32 batch.
+
+    The common length is target_seq_len when given; otherwise the shortest input
+    length for "truncate_min" or the longest for "pad_max". Longer inputs lose
+    their tail, shorter ones are right-padded with pad_value.
+
+    Returns (X, lengths): X is (n_sequences, common_length, n_features) and lengths
+    are the original per-sequence lengths (int32). Raises ValueError on empty
+    input, mixed feature counts, an unknown mode, or a non-positive length.
+    """
+    if len(sequences) == 0:
+        raise ValueError("No sequences provided.")
+    lengths = np.array([seq.shape[0] for seq in sequences], dtype=np.int32)
+    widths = {seq.shape[1] for seq in sequences}
+    if len(widths) != 1:
+        raise ValueError(f"Inconsistent feature dimensions found: {sorted(widths)}")
+    if target_seq_len is None:
+        reducers = {"truncate_min": np.min, "pad_max": np.max}
+        if seq_len_mode not in reducers:
+            raise ValueError(f"Unknown seq_len_mode: {seq_len_mode}")
+        target_seq_len = reducers[seq_len_mode](lengths)
+    n_steps = int(target_seq_len)
+    if n_steps <= 0:
+        raise ValueError("target_seq_len must be positive.")
+    X = np.full((len(sequences), n_steps, widths.pop()), pad_value, dtype=np.float32)
+    for row, seq in zip(X, sequences):
+        kept = min(seq.shape[0], n_steps)
+        row[:kept] = seq[:kept].astype(np.float32, copy=False)
+    return X, lengths
+
+
+def standardize_ts2vec_input(
+    X: np.ndarray, eps: float = 1e-6
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Z-score X using stats over axes 0 and 1; stds below eps are replaced by 1."""
+    mu = X.mean(axis=(0, 1), keepdims=True)
+    sd = X.std(axis=(0, 1), keepdims=True)
+    sd = np.where(sd < eps, 1.0, sd)
+    return ((X - mu) / sd).astype(np.float32, copy=False), mu.squeeze(), sd.squeeze()
+
+
+def build_parser() -> argparse.ArgumentParser:  # declares the CLI only; nothing is parsed here
+    parser = argparse.ArgumentParser(
+        description=(
+            "Load dFC feature sequences across multiple datasets and train a TS2Vec "
+            "model to learn embeddings."
+        )
+    )
+    parser.add_argument(
+        "--multi_dataset_info",
+        type=str,
+        required=True,
+        help="Path to task_dFC/run_scripts_slurm/multi_dataset_info.json",
+    )
+    parser.add_argument(
+        "--simul_or_real",
+        type=str,
+        required=True,
+        choices=["real", "simulated"],
+        help="Which section of the multi-dataset config to use.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default=None,
+        help="Output directory. Defaults to <multi_dataset_info.output_root>/TS2Vec/<simul_or_real>.",
+    )
+    parser.add_argument(
+        "--dFC_ids",
+        type=int,
+        nargs="+",
+        required=True,
+        help="One or more dFC method IDs to process. A separate TS2Vec model is trained per compatible group.",
+    )
+    parser.add_argument(
+        "--datasets",
+        type=str,
+        nargs="*",
+        default=None,
+        help="Optional subset of dataset IDs to include.",
+    )
+    parser.add_argument(
+        "--tasks",
+        type=str,
+        nargs="*",
+        default=None,
+        help="Optional subset of task labels to include (e.g., task-Axcpt).",
+    )
+    parser.add_argument(
+        "--sessions",
+        type=str,
+        nargs="*",
+        default=None,
+        help="Optional subset of session labels to include.",
+    )
+    parser.add_argument(
+        "--runs",
+        type=str,
+        nargs="*",
+        default=None,
+        help="Optional subset of run labels to include.",
+    )
+    parser.add_argument(
+        "--dynamic_pred",
+        type=str,
+        default="no",
+        choices=["no", "past", "past_and_future"],
+        help="Feature stacking mode reused from pydfc.ml_utils.dFC_feature_extraction_subj_lvl.",
+    )
+    parser.add_argument(
+        "--normalize_dFC",
+        type=str2bool,
+        default=True,
+        help="Apply rank normalization to state-free dFC matrices before vectorization.",
+    )
+    parser.add_argument(
+        "--FCS_proba_for_SB",
+        type=str2bool,
+        default=True,
+        help="For state-based dFC, use FCS probabilities instead of vectorized dFC matrices.",
+    )
+    parser.add_argument(
+        "--min_seq_len",
+        type=int,
+        default=10,
+        help="Minimum sequence length (TRs) after feature extraction.",
+    )
+    parser.add_argument(
+        "--max_subjects_per_scan",
+        type=int,
+        default=None,
+        help="Randomly subsample subjects per (dataset, session, task, run, dFC_id).",
+    )
+    parser.add_argument(
+        "--max_total_sequences",
+        type=int,
+        default=None,
+        help="Optional global cap on number of sequences per TS2Vec training group.",
+    )
+    parser.add_argument(
+        "--seq_len_mode",
+        type=str,
+        default="truncate_min",
+        choices=["truncate_min", "pad_max"],
+        help="How to make variable-length sequences compatible for TS2Vec input.",
+    )
+    parser.add_argument(
+        "--target_seq_len",
+        type=int,
+        default=None,
+        help="Override sequence length used for training input (truncate/pad to this length).",
+    )
+    parser.add_argument(
+        "--pad_value",
+        type=float,
+        default=0.0,
+        help="Padding value when --seq_len_mode=pad_max or --target_seq_len exceeds sequence length.",
+    )
+    parser.add_argument(
+        "--standardize_features",
+        type=str2bool,
+        default=False,
+        help="Z-score features globally across sequences and timepoints before TS2Vec training.",
+    )
+    parser.add_argument(
+        "--seed", type=int, default=0, help="Random seed for subsampling."
+    )
+
+    # TS2Vec common args (kept optional and overridable via the *_json options below)
+    parser.add_argument(
+        "--device", type=str, default=None, help="TS2Vec device (e.g., cpu, cuda)."
+    )
+    parser.add_argument(
+        "--output_dims", type=int, default=320, help="TS2Vec output embedding dimension."
+    )
+    parser.add_argument(
+        "--hidden_dims", type=int, default=64, help="TS2Vec hidden dimension."
+    )
+    parser.add_argument("--depth", type=int, default=10, help="TS2Vec encoder depth.")
+    parser.add_argument(
+        "--batch_size", type=int, default=8, help="TS2Vec fit batch size."
+    )
+    parser.add_argument(
+        "--lr", type=float, default=1e-3, help="TS2Vec fit learning rate (if supported)."
+    )
+    parser.add_argument(
+        "--max_train_length",
+        type=int,
+        default=None,
+        help="TS2Vec max_train_length (if supported).",
+    )
+    parser.add_argument(
+        "--temporal_unit",
+        type=int,
+        default=0,
+        help="TS2Vec temporal_unit (if supported).",
+    )
+    parser.add_argument(
+        "--n_epochs", type=int, default=50, help="Number of TS2Vec training epochs."
+    )
+    # Free-form JSON strings carrying extra kwargs for TS2Vec init / fit / encode.
+    parser.add_argument(
+        "--ts2vec_init_json",
+        type=str,
+        default=None,
+        help="Extra JSON object of kwargs for TS2Vec(...) init. Overrides common args on key conflict.",
+    )
+    parser.add_argument(
+        "--ts2vec_fit_json",
+        type=str,
+        default=None,
+        help="Extra JSON object of kwargs for model.fit(...). Overrides common args on key conflict.",
+    )
+    parser.add_argument(
+        "--ts2vec_encode_json",
+        type=str,
+        default=None,
+        help="Extra JSON object of kwargs for model.encode(...).",
+    )
+    # Embedding export and model persistence options.
+    parser.add_argument(
+        "--encoding_window",
+        type=str,
+        default="full_series",
+        help="TS2Vec encode encoding_window. Use integer string for numeric window or full_series.",
+    )
+    parser.add_argument(
+        "--save_timestep_embeddings",
+        type=str2bool,
+        default=False,
+        help="Also save per-timestep embeddings (can be large).",
+    )
+    parser.add_argument(
+        "--save_model",
+        type=str2bool,
+        default=True,
+        help="Try to save the TS2Vec model if the package exposes model.save(...).",
+    )
+
+    return parser
+
+
+def instantiate_ts2vec(
+    TS2Vec: Any, init_kwargs: Dict[str, Any]
+) -> Tuple[Any, Dict[str, Any]]:
+    """
+    Build a TS2Vec model, retrying with fewer optional kwargs on TypeError.
+
+    The full kwargs are tried first; then "temporal_unit", "max_train_length" and
+    "device" are removed cumulatively, in that order, skipping keys that are absent.
+    Returns (model, kwargs that succeeded); raises TypeError if every attempt fails.
+    """
+    attempts = [dict(init_kwargs)]
+    for optional_key in ("temporal_unit", "max_train_length", "device"):
+        if optional_key in attempts[-1]:
+            reduced = dict(attempts[-1])
+            del reduced[optional_key]
+            attempts.append(reduced)
+
+    failure = None
+    for attempt in attempts:
+        try:
+            return TS2Vec(**attempt), attempt
+        except TypeError as exc:
+            failure = exc
+
+    raise TypeError(
+        f"Could not instantiate TS2Vec with tested kwargs variants: {failure}"
+    )
+
+
+def main() -> None:
+    """Load dFC feature sequences, train one TS2Vec model per group, save outputs."""
+    args = build_parser().parse_args()
+
+    rng = np.random.default_rng(args.seed)
+    multi_dataset_info, main_root, datasets, tasks_to_include = load_multi_dataset_spec(
+        args.multi_dataset_info, args.simul_or_real
+    )
+    # Optional CLI filters; tasks fall back to the config's TASKS_to_include list.
+    if args.datasets:
+        datasets = [d for d in datasets if d in set(args.datasets)]
+    task_filter = set(args.tasks) if args.tasks else set(tasks_to_include)
+    session_filter = (
+        {normalize_optional_token(x) for x in args.sessions} if args.sessions else None
+    )
+    run_filter = {normalize_optional_token(x) for x in args.runs} if args.runs else None
+
+    if args.output_dir is None:
+        output_root = f"{multi_dataset_info['output_root']}/TS2Vec/{args.simul_or_real}"
+    else:
+        output_root = args.output_dir
+    Path(output_root).mkdir(parents=True, exist_ok=True)
+
+    # group key -> payload
+    grouped_sequences: Dict[Tuple[int, str, int], List[np.ndarray]] = {}
+    grouped_targets: Dict[Tuple[int, str, int], List[np.ndarray]] = {}
+    grouped_meta: Dict[Tuple[int, str, int], List[Dict[str, Any]]] = {}
+    skipped_records: List[Dict[str, Any]] = []
+
+    total_loaded = 0
+    print(f"Datasets to process: {datasets}")
+    for dataset in datasets:
+        dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
+        roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
+        dFC_root = f"{main_root}/{dataset}/derivatives/dFC_assessed"
+
+        if not os.path.exists(dataset_info_file):
+            print(
+                f"Skipping dataset {dataset}: dataset_info.json not found at {dataset_info_file}"
+            )
+            continue
+
+        sessions, tasks, runs_map = load_dataset_info(dataset_info_file)
+        if session_filter is not None:
+            sessions = [s for s in sessions if s in session_filter]
+
+        for session in sessions:
+            for task in tasks:
+                if task not in task_filter:
+                    continue
+                runs = runs_map.get(task, [None])
+                if run_filter is not None:
+                    runs = [r for r in runs if r in run_filter]
+
+                for run in runs:
+                    for dFC_id in args.dFC_ids:
+                        try:
+                            subjects = find_available_subjects(
+                                dFC_root=dFC_root,
+                                task=task,
+                                run=run,
+                                session=session,
+                                dFC_id=dFC_id,
+                            )
+                        except FileNotFoundError:
+                            print(f"Skipping missing dFC directory: {dFC_root}")
+                            continue
+
+                        if len(subjects) == 0:
+                            continue
+                        subjects = choose_subjects(
+                            subjects=subjects,
+                            max_subjects_per_scan=args.max_subjects_per_scan,
+                            rng=rng,
+                        )
+
+                        print(
+                            "Loading "
+                            f"dataset={dataset} session={session} task={task} run={run} "
+                            f"dFC_id={dFC_id} n_subjects={len(subjects)}"
+                        )
+                        for subj in subjects:
+                            try:
+                                dFC = load_dFC(
+                                    dFC_root=dFC_root,
+                                    subj=subj,
+                                    task=task,
+                                    dFC_id=dFC_id,
+                                    run=run,
+                                    session=session,
+                                )
+                                task_data = load_task_data(
+                                    roi_root=roi_root,
+                                    subj=subj,
+                                    task=task,
+                                    run=run,
+                                    session=session,
+                                )
+                                X_subj, y_subj = dFC_feature_extraction_subj_lvl(
+                                    dFC=dFC,
+                                    task_data=task_data,
+                                    dynamic_pred=args.dynamic_pred,
+                                    normalize_dFC=args.normalize_dFC,
+                                    FCS_proba_for_SB=args.FCS_proba_for_SB,
+                                )
+                            except Exception as e:
+                                skipped_records.append(
+                                    {
+                                        "dataset": dataset,
+                                        "session": session,
+                                        "task": task,
+                                        "run": run,
+                                        "dFC_id": dFC_id,
+                                        "subject": subj,
+                                        "reason": f"{type(e).__name__}: {e}",
+                                    }
+                                )
+                                continue
+
+                            if X_subj.shape[0] < args.min_seq_len:
+                                skipped_records.append(
+                                    {
+                                        "dataset": dataset,
+                                        "session": session,
+                                        "task": task,
+                                        "run": run,
+                                        "dFC_id": dFC_id,
+                                        "subject": subj,
+                                        "reason": f"seq_too_short({X_subj.shape[0]}<{args.min_seq_len})",
+                                    }
+                                )
+                                continue
+                            # Sequences are pooled per (dFC_id, measure name, feature count).
+                            measure_name = dFC.measure.measure_name
+                            group_key = (dFC_id, measure_name, int(X_subj.shape[1]))
+                            grouped_sequences.setdefault(group_key, []).append(X_subj)
+                            grouped_targets.setdefault(group_key, []).append(y_subj)
+                            grouped_meta.setdefault(group_key, []).append(
+                                {
+                                    "dataset": dataset,
+                                    "session": session,
+                                    "task": task,
+                                    "run": run,
+                                    "dFC_id": dFC_id,
+                                    "subject": subj,
+                                    "measure_name": measure_name,
+                                    "seq_len_raw": int(X_subj.shape[0]),
+                                    "feature_dim": int(X_subj.shape[1]),
+                                    "task_presence_mean": float(np.mean(y_subj)),
+                                }
+                            )
+                            total_loaded += 1
+
+    print(f"Loaded sequences: {total_loaded}")
+    if total_loaded == 0:
+        raise RuntimeError("No sequences were loaded. Check filters/paths/dFC_ids.")
+
+    # Lazy import to avoid making this script unusable when ts2vec is not installed.
+    try:
+        from ts2vec import TS2Vec  # type: ignore
+    except ImportError as e:
+        raise ImportError(
+            "TS2Vec package is not installed. Install a compatible implementation "
+            "(commonly `pip install ts2vec`) and rerun."
+        ) from e
+
+    ts2vec_init_extra = parse_json_arg(args.ts2vec_init_json)
+    ts2vec_fit_extra = parse_json_arg(args.ts2vec_fit_json)
+    ts2vec_encode_extra = parse_json_arg(args.ts2vec_encode_json)
+
+    encoding_window: Any = args.encoding_window
+    if isinstance(encoding_window, str) and encoding_window.isdigit():
+        encoding_window = int(encoding_window)
+
+    run_summaries: List[Dict[str, Any]] = []
+
+    for group_key in sorted(grouped_sequences.keys(), key=lambda x: (x[0], x[1], x[2])):
+        dFC_id, measure_name, feature_dim = group_key
+        sequences = grouped_sequences[group_key]
+        targets = grouped_targets[group_key]
+        meta_rows = grouped_meta[group_key]
+        # Optional cap: keep a random subset of the group, in its original order.
+        if (
+            args.max_total_sequences is not None
+            and len(sequences) > args.max_total_sequences
+        ):
+            idx = rng.choice(len(sequences), size=args.max_total_sequences, replace=False)
+            idx = np.sort(idx)
+            sequences = [sequences[i] for i in idx]
+            targets = [targets[i] for i in idx]
+            meta_rows = [meta_rows[i] for i in idx]
+
+        if len(sequences) < 2:
+            print(
+                f"Skipping group dFC_id={dFC_id}, measure={measure_name}, feat_dim={feature_dim}: "
+                "need at least 2 sequences for training."
+            )
+            continue
+
+        X_ts2vec, raw_lengths = prepare_ts2vec_input(
+            sequences=sequences,
+            seq_len_mode=args.seq_len_mode,
+            pad_value=args.pad_value,
+            target_seq_len=args.target_seq_len,
+        )
+
+        feature_mean = None
+        feature_std = None
+        if args.standardize_features:
+            X_ts2vec, feature_mean, feature_std = standardize_ts2vec_input(X_ts2vec)
+
+        print(
+            f"Training TS2Vec on group dFC_id={dFC_id}, measure={measure_name}, "
+            f"X.shape={X_ts2vec.shape}"
+        )
+
+        init_kwargs: Dict[str, Any] = {
+            "input_dims": int(feature_dim),
+            "output_dims": int(args.output_dims),
+            "hidden_dims": int(args.hidden_dims),
+            "depth": int(args.depth),
+        }
+        if args.device is not None:
+            init_kwargs["device"] = args.device
+        if args.max_train_length is not None:
+            init_kwargs["max_train_length"] = int(args.max_train_length)
+        if args.temporal_unit is not None:
+            init_kwargs["temporal_unit"] = int(args.temporal_unit)
+        init_kwargs.update(ts2vec_init_extra)
+
+        fit_kwargs: Dict[str, Any] = {
+            "n_epochs": int(args.n_epochs),
+            "batch_size": int(args.batch_size),
+            "lr": float(args.lr),
+        }
+        fit_kwargs.update(ts2vec_fit_extra)
+
+        model, effective_init_kwargs = instantiate_ts2vec(TS2Vec, init_kwargs)
+        try:
+            _ = model.fit(X_ts2vec, **fit_kwargs)
+        except TypeError:
+            # Different TS2Vec implementations expose different fit signatures.
+            fit_kwargs_fallback = dict(fit_kwargs)
+            fit_kwargs_fallback.pop("lr", None)
+            _ = model.fit(X_ts2vec, **fit_kwargs_fallback)
+            fit_kwargs = fit_kwargs_fallback
+
+        encode_kwargs: Dict[str, Any] = {"encoding_window": encoding_window}
+        encode_kwargs.update(ts2vec_encode_extra)
+        full_series_embeddings = model.encode(X_ts2vec, **encode_kwargs)
+        # A second encode call without kwargs supplies the optional per-timestep output.
+        timestep_embeddings = None
+        if args.save_timestep_embeddings:
+            timestep_embeddings = model.encode(X_ts2vec)
+
+        safe_measure = str(measure_name).replace(" ", "_")
+        group_dir = Path(output_root) / f"dFC_{dFC_id}_{safe_measure}_feat{feature_dim}"
+        group_dir.mkdir(parents=True, exist_ok=True)
+
+        meta_df = pd.DataFrame(meta_rows).copy()
+        meta_df["seq_len_used"] = int(X_ts2vec.shape[1])
+        meta_df["seq_len_raw"] = raw_lengths
+        meta_df.to_csv(group_dir / "sequence_metadata.csv", index=False)
+
+        np.save(
+            group_dir / "full_series_embeddings.npy", np.asarray(full_series_embeddings)
+        )
+        np.save(group_dir / "train_sequences_input.npy", X_ts2vec)
+        np.save(
+            group_dir / "task_presence_labels.npy",
+            np.array(targets, dtype=object),
+            allow_pickle=True,
+        )
+        if timestep_embeddings is not None:
+            np.save(
+                group_dir / "timestep_embeddings.npy", np.asarray(timestep_embeddings)
+            )
+        if feature_mean is not None and feature_std is not None:
+            np.save(group_dir / "feature_mean.npy", np.asarray(feature_mean))
+            np.save(group_dir / "feature_std.npy", np.asarray(feature_std))
+
+        model_saved = False
+        if args.save_model and hasattr(model, "save"):
+            try:
+                model.save(str(group_dir / "ts2vec_model"))
+                model_saved = True
+            except Exception as e:
+                print(f"Could not save TS2Vec model for group {group_key}: {e}")
+
+        config_to_save = {
+            "group_key": {
+                "dFC_id": int(dFC_id),
+                "measure_name": str(measure_name),
+                "feature_dim": int(feature_dim),
+            },
+            "data": {
+                "n_sequences": int(len(sequences)),
+                "seq_len_mode": args.seq_len_mode,
+                "target_seq_len": int(X_ts2vec.shape[1]),
+                "raw_seq_len_min": int(raw_lengths.min()),
+                "raw_seq_len_max": int(raw_lengths.max()),
+                "standardize_features": bool(args.standardize_features),
+            },
+            "loader_params": {
+                "simul_or_real": args.simul_or_real,
+                "datasets": datasets,
+                "task_filter": sorted(list(task_filter)),
+                "session_filter": (  # key=str: a filter may mix None with str labels
+                    None if session_filter is None else sorted(session_filter, key=str)
+                ),
+                "run_filter": None if run_filter is None else sorted(run_filter, key=str),
+                "dFC_ids": [int(x) for x in args.dFC_ids],
+                "dynamic_pred": args.dynamic_pred,
+                "normalize_dFC": bool(args.normalize_dFC),
+                "FCS_proba_for_SB": bool(args.FCS_proba_for_SB),
+                "min_seq_len": int(args.min_seq_len),
+                "max_subjects_per_scan": args.max_subjects_per_scan,
+                "max_total_sequences": args.max_total_sequences,
+                "seed": int(args.seed),
+            },
+            "ts2vec": {
+                "init_kwargs": effective_init_kwargs,
+                "fit_kwargs": fit_kwargs,
+                "encode_kwargs": encode_kwargs,
+                "model_saved": model_saved,
+            },
+        }
+        with open(group_dir / "run_config.json", "w") as f:
+            json.dump(config_to_save, f, indent=2)
+
+        run_summaries.append(
+            {
+                "dFC_id": int(dFC_id),
+                "measure_name": str(measure_name),
+                "feature_dim": int(feature_dim),
+                "n_sequences": int(len(sequences)),
+                "seq_len_used": int(X_ts2vec.shape[1]),
+                "embedding_shape": list(np.asarray(full_series_embeddings).shape),
+                "output_dir": str(group_dir),
+            }
+        )
+
+        # Avoid holding multiple large arrays/models longer than needed.
+        del model
+
+    if skipped_records:
+        pd.DataFrame(skipped_records).to_csv(
+            Path(output_root) / "skipped_records.csv", index=False
+        )
+    with open(Path(output_root) / "run_summary.json", "w") as f:
+        json.dump(run_summaries, f, indent=2)
+
+    print(f"Finished. Outputs written to: {output_root}")
+
+
+if __name__ == "__main__":  # run the pipeline only when executed as a script
+    main()

From c38648cc173a0fed11f4a565df87c997ae8a5362 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 24 Feb 2026 17:49:58 -0500
Subject: [PATCH 347/401] finish putting embed in CV

---
 pydfc/ml_utils.py | 381 ++++++++++++----------------------------------
 1 file changed, 99 insertions(+), 282 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 165626b..3fe1262 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1613,87 +1613,43 @@ def get_classification_results(
     return RESULT
 
 
-def logistic_regression_classify(X_train, y_train, X_test, y_test, subj_label_train=None):
-    """
-    Logistic regression classification
-
-    provide subj_label_train if you want to use StratifiedGroupKFold
-    to ensure that the same subject is not in both train and test sets
-    """
-    # create a pipeline with a logistic regression model to find the best C
-    logistic_reg = make_pipeline(
-        StandardScaler(),
-        LogisticRegression(penalty="l1", solver="saga", max_iter=2000, tol=1e-3),
-    )
-    # create a dictionary of all values we want to test for C
-    param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10, 100]}
-
-    # use StratifiedGroupKFold to ensure that the same subject is not in both train and test sets
-    # shuffle the data to ensure time points are shuffled
-    if subj_label_train is None:
-        X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train)
-        cv = StratifiedKFold(n_splits=3)
-    else:
-        X_train_shuffled, y_train_shuffled, subj_label_train_shuffled = shuffle(
-            X_train, y_train, subj_label_train
-        )
-        cv = StratifiedGroupKFold(n_splits=3)
-    # use gridsearch to test all values for C
-    lr_gscv = GridSearchCV(logistic_reg, param_grid, cv=cv, n_jobs=-1)
-    # fit model to data
-    if subj_label_train is None:
-        lr_gscv.fit(X_train_shuffled, y_train_shuffled)
-    else:
-        # use groups to ensure that the same subject is not in both train and test sets
-        lr_gscv.fit(X_train_shuffled, y_train_shuffled, groups=subj_label_train_shuffled)
-
-    C = lr_gscv.best_params_["logisticregression__C"]
-
-    model = make_pipeline(
-        StandardScaler(),
-        LogisticRegression(penalty="l1", C=C, solver="saga", max_iter=2000, tol=1e-3),
-    )
-
-    RESULT = get_classification_results(
-        X_train=X_train,
-        X_test=X_test,
-        y_train=y_train,
-        y_test=y_test,
-        classifier_model=model,
-    )
-
-    return RESULT
-
-
-def SVM_classify(
+def logistic_regression_classify(
     X_train,
     y_train,
     X_test,
     y_test,
     subj_label_train=None,
-    embedding_method="PCA",
+    embedding_method=None,
 ):
+
     if embedding_method == "PCA":
         emb = PCA(whiten=False, svd_solver="full", random_state=0)
     elif embedding_method == "PLS":
         emb = PLSEmbedder(scale=False)  # IMPORTANT: avoid double scaling
+    elif embedding_method is None:
+        emb = None
     else:
         raise ValueError("embedding_method must be 'PCA' or 'PLS'.")
 
-    pipe = Pipeline(
-        [
-            ("scaler", StandardScaler(with_mean=True, with_std=True)),
-            ("emb", emb),
-            ("svc", SVC(kernel="rbf")),
-        ]
+    if emb is not None:
+        # Grid (keep small!)
+        param_grid = {
+            "emb__n_components": [5, 10, 20, 30, 50, 100],
+            "lr__C": [0.001, 0.01, 0.1, 1, 10, 100],
+        }
+    else:
+        param_grid = {"lr__C": [0.001, 0.01, 0.1, 1, 10, 100]}
+
+    steps = [("scaler", StandardScaler(with_mean=True, with_std=True))]
+
+    if emb is not None:
+        steps.append(("emb", emb))
+
+    steps.append(
+        ("lr", LogisticRegression(penalty="l1", solver="saga", max_iter=2000, tol=1e-3))
     )
 
-    # Grid (keep small!)
-    param_grid = {
-        "emb__n_components": [5, 10, 20, 30, 50, 100],
-        "svc__C": [0.1, 1, 10],
-        "svc__gamma": ["scale", 0.01, 0.1],
-    }
+    pipe = Pipeline(steps)
 
     # CV splitter
     if subj_label_train is None:
@@ -1724,195 +1680,60 @@ def SVM_classify(
 
 
 def SVM_classify(
-    X_train, y_train, X_test, y_test, subj_label_train=None, embedding_method="PCA"
+    X_train,
+    y_train,
+    X_test,
+    y_test,
+    subj_label_train=None,
+    embedding_method=None,
 ):
-    """
-    SVM classification
-
-    provide subj_label_train if you want to use StratifiedGroupKFold
-    to ensure that the same subject is not in both train and test sets
-    """
     if embedding_method == "PCA":
-        grid_embedding_name = "pca__n_components"
-        embedding_model = PCA(whiten=False, svd_solver="full")
+        emb = PCA(whiten=False, svd_solver="full", random_state=0)
     elif embedding_method == "PLS":
-        grid_embedding_name = "pls__n_components"
-        embedding_model = PLSEmbedder(scale=True)
-    # define the parameter grid
-    param_grid = {
-        grid_embedding_name: [5, 10, 20, 30, 50, 100],
-        "svc__C": [0.1, 1, 10],
-        "svc__gamma": ["scale", 0.01, 0.1],
-    }
-
-    # perform grid search
-    model_for_hyperparam = make_pipeline(
-        StandardScaler(),
-        embedding_model,
-        SVC(kernel="rbf"),
-    )
-    # use StratifiedGroupKFold to ensure that the same subject is not in both train and test sets
-    # shuffle the data to ensure time points are shuffled
-    if subj_label_train is None:
-        X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train)
-        cv = StratifiedKFold(n_splits=3)
-    else:
-        X_train_shuffled, y_train_shuffled, subj_label_train_shuffled = shuffle(
-            X_train, y_train, subj_label_train
-        )
-        cv = StratifiedGroupKFold(n_splits=3)
-    model_gscv = GridSearchCV(model_for_hyperparam, param_grid, cv=cv, n_jobs=-1)
-    if subj_label_train is None:
-        model_gscv.fit(X_train_shuffled, y_train_shuffled)
+        emb = PLSEmbedder(scale=False)  # IMPORTANT: avoid double scaling
+    elif embedding_method is None:
+        emb = None
     else:
-        model_gscv.fit(
-            X_train_shuffled, y_train_shuffled, groups=subj_label_train_shuffled
-        )
-    n_components = model_gscv.best_params_[grid_embedding_name]
-    C = model_gscv.best_params_["svc__C"]
-    gamma = model_gscv.best_params_["svc__gamma"]
-
-    if embedding_method == "PCA":
-        embedding_model_final = PCA(
-            n_components=n_components, whiten=False, svd_solver="full"
-        )
-    elif embedding_method == "PLS":
-        embedding_model_final = PLSEmbedder(n_components=n_components, scale=True)
-
-    model = make_pipeline(
-        StandardScaler(),
-        embedding_model_final,
-        SVC(kernel="rbf", C=C, gamma=gamma),
-    )
+        raise ValueError("embedding_method must be 'PCA' or 'PLS'.")
 
-    RESULT = get_classification_results(
-        X_train=X_train,
-        X_test=X_test,
-        y_train=y_train,
-        y_test=y_test,
-        classifier_model=model,
-    )
-    return RESULT
+    if emb is not None:
+        # Grid (keep small!)
+        param_grid = {
+            "emb__n_components": [5, 10, 20, 30, 50, 100],
+            "svc__C": [0.1, 1, 10],
+            "svc__gamma": ["scale", 0.01, 0.1],
+        }
+    else:
+        param_grid = {
+            "svc__C": [0.1, 1, 10],
+            "svc__gamma": ["scale", 0.01, 0.1],
+        }
 
+    steps = [("scaler", StandardScaler(with_mean=True, with_std=True))]
 
-def KNN_classify(X_train, y_train, X_test, y_test, subj_label_train=None):
-    """
-    KNN classification
-    """
+    if emb is not None:
+        steps.append(("emb", emb))
 
-    # create a dictionary of all values we want to test for n_neighbors
-    param_grid = {"kneighborsclassifier__n_neighbors": np.arange(1, 30)}
+    steps.append(("svc", SVC(kernel="rbf")))
 
-    # perform grid search
-    model_for_hyperparam = make_pipeline(
-        StandardScaler(),
-        KNeighborsClassifier(),
-    )
+    pipe = Pipeline(steps)
 
-    # use StratifiedGroupKFold to ensure that the same subject is not in both train and test sets
-    # shuffle the data to ensure time points are shuffled
-    if subj_label_train is None:
-        X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train)
-        cv = StratifiedKFold(n_splits=3)
-    else:
-        X_train_shuffled, y_train_shuffled, subj_label_train_shuffled = shuffle(
-            X_train, y_train, subj_label_train
-        )
-        cv = StratifiedGroupKFold(n_splits=3)
-    model_gscv = GridSearchCV(model_for_hyperparam, param_grid, cv=cv, n_jobs=-1)
+    # CV splitter
     if subj_label_train is None:
-        model_gscv.fit(X_train_shuffled, y_train_shuffled)
+        Xs, ys = shuffle(X_train, y_train, random_state=0)
+        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
+        fit_kwargs = {}
     else:
-        model_gscv.fit(
-            X_train_shuffled, y_train_shuffled, groups=subj_label_train_shuffled
-        )
-
-    n_neighbors = model_gscv.best_params_["kneighborsclassifier__n_neighbors"]
-
-    model = make_pipeline(
-        StandardScaler(),
-        KNeighborsClassifier(n_neighbors=n_neighbors),
-    )
-
-    RESULT = get_classification_results(
-        X_train=X_train,
-        X_test=X_test,
-        y_train=y_train,
-        y_test=y_test,
-        classifier_model=model,
-    )
-
-    return RESULT
-
-
-def random_forest_classify(X_train, y_train, X_test, y_test):
-    """
-    Random Forest classification
-    """
-    # create a pipeline with a random forest model to find the best n_estimators
-    rf = make_pipeline(
-        StandardScaler(),
-        RandomForestClassifier(),
-    )
-    # create a dictionary of all values we want to test for n_estimators
-    param_grid = {
-        "randomforestclassifier__n_estimators": [10, 50, 100, 200],
-        "randomforestclassifier__max_depth": [None, 5, 10, 20, 30],
-    }
-    # use gridsearch to test all values for n_estimators
-    rf_gscv = GridSearchCV(rf, param_grid, cv=5)
-    # fit model to data
-    rf_gscv.fit(X_train, y_train)
-
-    n_estimators = rf_gscv.best_params_["randomforestclassifier__n_estimators"]
-    max_depth = rf_gscv.best_params_["randomforestclassifier__max_depth"]
-
-    model = make_pipeline(
-        StandardScaler(),
-        RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth),
-    )
-
-    RESULT = get_classification_results(
-        X_train=X_train,
-        X_test=X_test,
-        y_train=y_train,
-        y_test=y_test,
-        classifier_model=model,
-    )
-
-    return RESULT
+        Xs, ys, gs = shuffle(X_train, y_train, subj_label_train, random_state=0)
+        cv = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=0)
+        fit_kwargs = {"groups": gs}
 
+    # GridSearch on training subjects only
+    gscv = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=-1, scoring="balanced_accuracy")
+    gscv.fit(Xs, ys, **fit_kwargs)
 
-def gradient_boosting_classify(X_train, y_train, X_test, y_test):
-    """
-    Gradient Boosting classification
-    """
-    # create a pipeline with a gradient boosting model to find the best n_estimators
-    gb = make_pipeline(
-        StandardScaler(),
-        GradientBoostingClassifier(),
-    )
-    # create a dictionary of all values we want to test for n_estimators
-    param_grid = {
-        "gradientboostingclassifier__n_estimators": [10, 50, 100, 200],
-        "gradientboostingclassifier__learning_rate": [0.01, 0.1, 0.2],
-        "gradientboostingclassifier__max_depth": [3, 5, 10],
-    }
-    # use gridsearch to test all values for n_estimators
-    gb_gscv = GridSearchCV(gb, param_grid, cv=5)
-    # fit model to data
-    gb_gscv.fit(X_train, y_train)
-
-    n_estimators = gb_gscv.best_params_["gradientboostingclassifier__n_estimators"]
-    learning_rate = gb_gscv.best_params_["gradientboostingclassifier__learning_rate"]
-    max_depth = gb_gscv.best_params_["gradientboostingclassifier__max_depth"]
-
-    model = make_pipeline(
-        StandardScaler(),
-        GradientBoostingClassifier(
-            n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate
-        ),
-    )
+    # Evaluate with best estimator (already refit on full training set by default)
+    model = gscv.best_estimator_
 
     RESULT = get_classification_results(
         X_train=X_train,
@@ -1921,7 +1742,7 @@ def gradient_boosting_classify(X_train, y_train, X_test, y_test):
         y_test=y_test,
         classifier_model=model,
     )
-
+    RESULT["best_params"] = gscv.best_params_
     return RESULT
 
 
@@ -2183,6 +2004,10 @@ def task_presence_classification(
         # raise error
         raise ValueError(f"Unknown measure name: {measure_name}")
 
+    if measure_is_state_based:
+        X_train = process_SB_features(X=X_train, measure_name=measure_name)
+        X_test = process_SB_features(X=X_test, measure_name=measure_name)
+
     ML_scores = {
         "group_lvl": {
             "task": list(),
@@ -2203,35 +2028,14 @@ def task_presence_classification(
         },
     }
 
-    EMBEDDINGS = ["PCA", "PLS", "LE"]
+    EMBEDDINGS = ["PCA", "PLS"]
     check_count = len(EMBEDDINGS)
     num_excluded_subjects = 0
     for embedding in EMBEDDINGS:
         if measure_is_state_based:
-            X_train_embedded = process_SB_features(X=X_train, measure_name=measure_name)
-            X_test_embedded = process_SB_features(X=X_test, measure_name=measure_name)
+            embedding_to_use = None
         else:
-            # embed dFC features
-            try:
-                X_train_embedded, X_test_embedded = embed_dFC_features(
-                    train_subjects=train_subjects,
-                    test_subjects=test_subjects,
-                    X_train=X_train,
-                    X_test=X_test,
-                    y_train=y_train,
-                    y_test=y_test,
-                    subj_label_train=subj_label_train,
-                    subj_label_test=subj_label_test,
-                    embedding=embedding,
-                    n_components="auto",
-                    n_neighbors_LE=125,
-                    LE_embedding_method="embed+procrustes",
-                    measure_is_state_based=measure_is_state_based,
-                )
-            except Exception as e:
-                print(f"Error in embedding dFC features with {embedding}: {e}")
-                check_count -= 1
-                continue
+            embedding_to_use = embedding
 
         # check if both classes are present in train and test sets
         if len(np.unique(y_train)) < 2 or len(np.unique(y_test)) < 2:
@@ -2241,42 +2045,55 @@ def task_presence_classification(
             check_count -= 1
             continue
 
-        # Silhouette score
-        # SI does not need to be separated for train and test sets
-        # we will use the same SI for both train and test sets
-        # using all samples from train and test sets
-        X_combined = np.concatenate((X_train_embedded, X_test_embedded), axis=0)
-        y_combined = np.concatenate((y_train, y_test), axis=0)
-
-        SI = {
-            "train": silhouette_score(X_combined, y_combined),
-            "test": silhouette_score(X_combined, y_combined),
-        }
-
         # task presence classification
 
         print("task presence classification ...")
 
         # logistic regression
         log_reg_RESULT = logistic_regression_classify(
-            X_train=X_train_embedded,
+            X_train=X_train,
             y_train=y_train,
-            X_test=X_test_embedded,
+            X_test=X_test,
             y_test=y_test,
             subj_label_train=subj_label_train,
+            embedding_method=embedding_to_use,
         )
 
         # SVM
         SVM_RESULT = SVM_classify(
-            X_train=X_train_embedded,
+            X_train=X_train,
             y_train=y_train,
-            X_test=X_test_embedded,
+            X_test=X_test,
             y_test=y_test,
             subj_label_train=subj_label_train,
+            embedding_method=embedding_to_use,
         )
 
         ML_models = {"Logistic regression": log_reg_RESULT, "SVM": SVM_RESULT}
 
+        # Silhouette score
+        # SI does not need to be separated for train and test sets
+        # we will use the same SI for both train and test sets
+        # using all samples from train and test sets
+        # use the embedding and scaler trained in SVM_RESULT["model"]
+        # so the results are comparable to the classification scores
+        scaler = SVM_RESULT["classifier_model"].named_steps["scaler"]
+        embedding_model = SVM_RESULT["classifier_model"].named_steps.get("emb", None)
+        if embedding_model is not None:
+            X_train_embedded = embedding_model.transform(scaler.transform(X_train))
+            X_test_embedded = embedding_model.transform(scaler.transform(X_test))
+        else:
+            X_train_embedded = scaler.transform(X_train)
+            X_test_embedded = scaler.transform(X_test)
+
+        X_combined = np.concatenate((X_train_embedded, X_test_embedded), axis=0)
+        y_combined = np.concatenate((y_train, y_test), axis=0)
+
+        SI = {
+            "train": silhouette_score(X_combined, y_combined),
+            "test": silhouette_score(X_combined, y_combined),
+        }
+
         # # permutation tests
         # permutation_scores = {
         #     "train": {},

From 138b6ceb8760f1f14979447d392f4afc1025a0b1 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 25 Feb 2026 10:40:58 -0500
Subject: [PATCH 348/401] minor

---
 .../train_ts2vec_dfc_embeddings.py            | 69 ++++++++++++++++---
 1 file changed, 61 insertions(+), 8 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/train_ts2vec_dfc_embeddings.py b/task_dFC/multi_dataset_analysis/train_ts2vec_dfc_embeddings.py
index f594b0b..471d303 100644
--- a/task_dFC/multi_dataset_analysis/train_ts2vec_dfc_embeddings.py
+++ b/task_dFC/multi_dataset_analysis/train_ts2vec_dfc_embeddings.py
@@ -1,4 +1,5 @@
 import argparse
+import inspect
 import json
 import os
 from pathlib import Path
@@ -375,6 +376,65 @@ def instantiate_ts2vec(
     )
 
 
+def fit_ts2vec_adaptive(
+    model: Any, X_ts2vec: np.ndarray, fit_kwargs: Dict[str, Any]
+) -> Dict[str, Any]:
+    """
+    Call ``model.fit`` with only the kwargs the installed TS2Vec accepts.
+
+    If the fit signature is readable, kwargs it cannot accept are dropped first;
+    if that call raises TypeError (or no signature exists), smaller sets follow.
+
+    Returns the kwargs of the call that succeeded. Raises TypeError if every
+    variant is rejected; other exceptions from ``fit`` propagate immediately.
+    """
+    candidates = []
+    try:
+        params = inspect.signature(model.fit).parameters
+    except (TypeError, ValueError):
+        # Some wrapped methods don't expose a reliable signature.
+        pass
+    else:
+        named_kinds = (
+            inspect.Parameter.POSITIONAL_OR_KEYWORD,
+            inspect.Parameter.KEYWORD_ONLY,
+        )
+        if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values()):
+            candidates.append(dict(fit_kwargs))
+        else:
+            allowed = {
+                n for n, p in params.items() if n != "self" and p.kind in named_kinds
+            }
+            candidates.append({k: v for k, v in fit_kwargs.items() if k in allowed})
+
+    # Blind fallbacks, from the full kwargs down to a bare fit(X) call.
+    candidates += [
+        dict(fit_kwargs),
+        {k: v for k, v in fit_kwargs.items() if k != "batch_size"},
+        {k: v for k, v in fit_kwargs.items() if k not in {"batch_size", "lr"}},
+        {k: v for k, v in fit_kwargs.items() if k == "n_epochs"},
+        {},
+    ]
+
+    last_error = None
+    seen = set()  # never repeat a fit with a kwarg set that was already tried
+    for kw in candidates:
+        key = tuple(sorted((str(k), str(v)) for k, v in kw.items()))
+        if key in seen:
+            continue
+        seen.add(key)
+        try:
+            _ = model.fit(X_ts2vec, **kw)
+            return kw
+        except TypeError as e:
+            # TypeError is treated as "kwargs rejected"; other errors propagate.
+            last_error = e
+
+    raise TypeError(
+        f"Could not call TS2Vec.fit with tested kwargs variants: {last_error}"
+    )
+
+
 def main() -> None:
     parser = build_parser()
     args = parser.parse_args()
@@ -613,14 +673,7 @@ def main() -> None:
         fit_kwargs.update(ts2vec_fit_extra)
 
         model, effective_init_kwargs = instantiate_ts2vec(TS2Vec, init_kwargs)
-        try:
-            _ = model.fit(X_ts2vec, **fit_kwargs)
-        except TypeError:
-            # Different TS2Vec implementations expose different fit signatures.
-            fit_kwargs_fallback = dict(fit_kwargs)
-            fit_kwargs_fallback.pop("lr", None)
-            _ = model.fit(X_ts2vec, **fit_kwargs_fallback)
-            fit_kwargs = fit_kwargs_fallback
+        fit_kwargs = fit_ts2vec_adaptive(model, X_ts2vec, fit_kwargs)
 
         encode_kwargs: Dict[str, Any] = {"encoding_window": encoding_window}
         encode_kwargs.update(ts2vec_encode_extra)

From 1936ab020fabb618952d010792aa1c15d135f4b6 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 25 Feb 2026 11:02:21 -0500
Subject: [PATCH 349/401] minor bug

---
 pydfc/ml_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 3fe1262..9ed5093 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -2077,8 +2077,8 @@ def task_presence_classification(
         # using all samples from train and test sets
         # use the embedding and scaler trained in SVM_RESULT["model"]
         # so the results are comparable to the classification scores
-        scaler = SVM_RESULT["classifier_model"].named_steps["scaler"]
-        embedding_model = SVM_RESULT["classifier_model"].named_steps.get("emb", None)
+        scaler = SVM_RESULT["model"].named_steps["scaler"]
+        embedding_model = SVM_RESULT["model"].named_steps.get("emb", None)
         if embedding_model is not None:
             X_train_embedded = embedding_model.transform(scaler.transform(X_train))
             X_test_embedded = embedding_model.transform(scaler.transform(X_test))

From 67cd5167d60c2b7b34de9dbb44ab513e204e6ee9 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 25 Feb 2026 12:01:17 -0500
Subject: [PATCH 350/401] minor bug

---
 pydfc/ml_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 9ed5093..91302ac 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -2161,11 +2161,11 @@ def task_presence_classification(
         for subj in SUBJECTS:
             if subj in train_subjects:
                 subj_group = "train"
-                features = X_train_embedded[subj_label_train == subj, :]
+                features = X_train[subj_label_train == subj, :]
                 target = y_train[subj_label_train == subj]
             elif subj in test_subjects:
                 subj_group = "test"
-                features = X_test_embedded[subj_label_test == subj, :]
+                features = X_test[subj_label_test == subj, :]
                 target = y_test[subj_label_test == subj]
             # check if only one class is present, skip the subject
             if len(np.unique(target)) < 2:

From 2ba8ab5ea18df8695d3272687c89f83e96bfdeba Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 28 Feb 2026 21:19:23 -0500
Subject: [PATCH 351/401] remove LE from ml_results

---
 task_dFC/multi_dataset_analysis/ml_results.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index 1fd2f18..ae5fef1 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -158,12 +158,9 @@
     GROUP = "test"
     TARGETS = [
         ("PCA", "Logistic regression balanced accuracy"),
-        ("LE", "Logistic regression balanced accuracy"),
         ("PLS", "Logistic regression balanced accuracy"),
         ("PCA", "SVM balanced accuracy"),
-        ("LE", "SVM balanced accuracy"),
         ("PLS", "SVM balanced accuracy"),
-        ("LE", "SI"),
         ("PCA", "SI"),
         ("PLS", "SI"),
     ]

From 831954fe9e2d78965fdcce1c3286666ddae8f7eb Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 28 Feb 2026 22:37:59 -0500
Subject: [PATCH 352/401] minor n_jobs

---
 pydfc/ml_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 91302ac..eb16619 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -1662,7 +1662,7 @@ def logistic_regression_classify(
         fit_kwargs = {"groups": gs}
 
     # GridSearch on training subjects only
-    gscv = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=-1, scoring="balanced_accuracy")
+    gscv = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=1, scoring="balanced_accuracy")
     gscv.fit(Xs, ys, **fit_kwargs)
 
     # Evaluate with best estimator (already refit on full training set by default)
@@ -1729,7 +1729,7 @@ def SVM_classify(
         fit_kwargs = {"groups": gs}
 
     # GridSearch on training subjects only
-    gscv = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=-1, scoring="balanced_accuracy")
+    gscv = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=1, scoring="balanced_accuracy")
     gscv.fit(Xs, ys, **fit_kwargs)
 
     # Evaluate with best estimator (already refit on full training set by default)

From c38bcd6095bdc0da31e9a4b8d39ca4dffabf4c67 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 2 Mar 2026 10:10:41 -0500
Subject: [PATCH 353/401] minor

---
 .../sample_matrix_visualization.py            | 184 +++++++++---------
 1 file changed, 92 insertions(+), 92 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 39310ca..f3e9800 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -19,7 +19,7 @@
 )
 
 use_raw_features = False  # if True, use raw dFC features instead of embedded features
-normalize_dFC = True
+normalize_dFC = False
 FCS_proba_for_SB = True
 train_test_ratio = 0.8
 embedding = "LE"
@@ -196,36 +196,36 @@
                             X_test_embedded = process_SB_features(
                                 X=X_test, measure_name=measure_name
                             )
-                        else:
-                            # embed dFC features
-                            try:
-                                X_train_embedded, X_test_embedded = embed_dFC_features(
-                                    train_subjects=train_subjects,
-                                    test_subjects=test_subjects,
-                                    X_train=X_train,
-                                    X_test=X_test,
-                                    y_train=y_train,
-                                    y_test=y_test,
-                                    subj_label_train=subj_label_train,
-                                    subj_label_test=subj_label_test,
-                                    embedding=embedding,
-                                    n_components="auto",
-                                    n_neighbors_LE=125,
-                                    LE_embedding_method="embed+procrustes",
-                                    measure_is_state_based=measure_is_state_based,
-                                )
-                                assert (
-                                    X_train_embedded.shape[0] == y_train.shape[0]
-                                ), "Number of samples do not match."
-                                assert (
-                                    X_test_embedded.shape[0] == y_test.shape[0]
-                                ), "Number of samples do not match."
-                            except Exception as e:
-                                print(
-                                    f"Error in embedding dFC features with {embedding}: {e}"
-                                )
-                                X_train_embedded = None
-                                X_test_embedded = None
+                        # else:
+                        #     # embed dFC features
+                        #     try:
+                        #         X_train_embedded, X_test_embedded = embed_dFC_features(
+                        #             train_subjects=train_subjects,
+                        #             test_subjects=test_subjects,
+                        #             X_train=X_train,
+                        #             X_test=X_test,
+                        #             y_train=y_train,
+                        #             y_test=y_test,
+                        #             subj_label_train=subj_label_train,
+                        #             subj_label_test=subj_label_test,
+                        #             embedding=embedding,
+                        #             n_components="auto",
+                        #             n_neighbors_LE=125,
+                        #             LE_embedding_method="embed+procrustes",
+                        #             measure_is_state_based=measure_is_state_based,
+                        #         )
+                        #         assert (
+                        #             X_train_embedded.shape[0] == y_train.shape[0]
+                        #         ), "Number of samples do not match."
+                        #         assert (
+                        #             X_test_embedded.shape[0] == y_test.shape[0]
+                        #         ), "Number of samples do not match."
+                        #     except Exception as e:
+                        #         print(
+                        #             f"Error in embedding dFC features with {embedding}: {e}"
+                        #         )
+                        #         X_train_embedded = None
+                        #         X_test_embedded = None
 
                         assert (
                             task not in DATA
@@ -233,8 +233,8 @@
                         DATA[task] = {
                             "X_train": X_train,
                             "X_test": X_test,
-                            "X_train_embedded": X_train_embedded,
-                            "X_test_embedded": X_test_embedded,
+                            # "X_train_embedded": X_train_embedded,
+                            # "X_test_embedded": X_test_embedded,
                             "y_train": y_train,
                             "y_test": y_test,
                             "subj_label_train": subj_label_train,
@@ -244,21 +244,21 @@
             # save the data
             # save each task in a separate file and name the file as the task name, measure name, and dataset name
             for task in DATA.keys():
-                if use_raw_features:
-                    X_train = DATA[task]["X_train"]
-                    X_test = DATA[task]["X_test"]
-                else:
-                    X_train = DATA[task]["X_train_embedded"]
-                    X_test = DATA[task]["X_test_embedded"]
-                y_train = DATA[task]["y_train"]
-                y_test = DATA[task]["y_test"]
-                subj_label_train = DATA[task]["subj_label_train"]
-                subj_label_test = DATA[task]["subj_label_test"]
-                measure_name = DATA[task]["measure_name"]
-
-                if X_train is None or X_test is None:
-                    print(f"Skipping task {task} due to embedding error.")
-                    continue
+                # if use_raw_features:
+                #     X_train = DATA[task]["X_train"]
+                #     X_test = DATA[task]["X_test"]
+                # else:
+                #     X_train = DATA[task]["X_train_embedded"]
+                #     X_test = DATA[task]["X_test_embedded"]
+                # y_train = DATA[task]["y_train"]
+                # y_test = DATA[task]["y_test"]
+                # subj_label_train = DATA[task]["subj_label_train"]
+                # subj_label_test = DATA[task]["subj_label_test"]
+                # measure_name = DATA[task]["measure_name"]
+
+                # if X_train is None or X_test is None:
+                #     print(f"Skipping task {task} due to embedding error.")
+                #     continue
 
                 if not os.path.exists(f"{output_root}/processed_data"):
                     os.makedirs(f"{output_root}/processed_data")
@@ -267,47 +267,47 @@
                     DATA[task],
                 )
 
-                for group, X, y in zip(
-                    ["train", "test"], [X_train, X_test], [y_train, y_test]
-                ):
-                    # if the folder does not exist, create it
-                    if not os.path.exists(f"{output_root}/{measure_name}"):
-                        os.makedirs(f"{output_root}/{measure_name}")
-
-                    # A) Unsorted (your first vis, but rotated so time is horizontal)
-                    plot_samples_features(
-                        X,
-                        y,
-                        sample_order="original",
-                        feature_order="original",
-                        save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_unsorted_{task}_{group}{raw_or_embedded}.png",
-                        show=False,
-                    )
-
-                    # B) Label-sorted (your third vis)
-                    plot_samples_features(
-                        X,
-                        y,
-                        sample_order="label",
-                        feature_order="original",
-                        save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-label_{task}_{group}{raw_or_embedded}.png",
-                        show=False,
-                    )
-
-                    # C) clustering
-                    plot_samples_features(
-                        X,
-                        y,
-                        sample_order="cluster",
-                        feature_order="original",
-                        save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_clustered-samples_{task}_{group}{raw_or_embedded}.png",
-                        show=False,
-                    )
-
-                    save_scalar_colorbar(
-                        cmap="coolwarm",
-                        vmin=-1.6,
-                        vmax=1.6,  # use the same V_RANGE you use in plots
-                        label="z-scored feature value",
-                        filename=f"{output_root}/zscore_colorbar.png",
-                    )
+                # for group, X, y in zip(
+                #     ["train", "test"], [X_train, X_test], [y_train, y_test]
+                # ):
+                #     # if the folder does not exist, create it
+                #     if not os.path.exists(f"{output_root}/{measure_name}"):
+                #         os.makedirs(f"{output_root}/{measure_name}")
+
+                #     # A) Unsorted (your first vis, but rotated so time is horizontal)
+                #     plot_samples_features(
+                #         X,
+                #         y,
+                #         sample_order="original",
+                #         feature_order="original",
+                #         save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_unsorted_{task}_{group}{raw_or_embedded}.png",
+                #         show=False,
+                #     )
+
+                #     # B) Label-sorted (your third vis)
+                #     plot_samples_features(
+                #         X,
+                #         y,
+                #         sample_order="label",
+                #         feature_order="original",
+                #         save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-label_{task}_{group}{raw_or_embedded}.png",
+                #         show=False,
+                #     )
+
+                #     # C) clustering
+                #     plot_samples_features(
+                #         X,
+                #         y,
+                #         sample_order="cluster",
+                #         feature_order="original",
+                #         save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_clustered-samples_{task}_{group}{raw_or_embedded}.png",
+                #         show=False,
+                #     )
+
+                #     save_scalar_colorbar(
+                #         cmap="coolwarm",
+                #         vmin=-1.6,
+                #         vmax=1.6,  # use the same V_RANGE you use in plots
+                #         label="z-scored feature value",
+                #         filename=f"{output_root}/zscore_colorbar.png",
+                #     )

From 628eefab3ec148d85da119fb39df6ab09f53877b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 2 Mar 2026 12:52:21 -0500
Subject: [PATCH 354/401] important changes in normalization

---
 pydfc/ml_utils.py | 16 +++++++++++-----
 task_dFC/ML.py    |  4 ++--
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index eb16619..7052efa 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -235,7 +235,7 @@ def dFC_feature_extraction_subj_lvl(
     dFC,
     task_data,
     dynamic_pred="no",
-    normalize_dFC=True,
+    normalize_dFC=False,
     FCS_proba_for_SB=True,
 ):
     """
@@ -327,7 +327,7 @@ def dFC_feature_extraction(
     run=None,
     session=None,
     dynamic_pred="no",
-    normalize_dFC=True,
+    normalize_dFC=False,
     FCS_proba_for_SB=True,
 ):
     """
@@ -1640,7 +1640,7 @@ def logistic_regression_classify(
     else:
         param_grid = {"lr__C": [0.001, 0.01, 0.1, 1, 10, 100]}
 
-    steps = [("scaler", StandardScaler(with_mean=True, with_std=True))]
+    steps = [("scaler", StandardScaler())]
 
     if emb is not None:
         steps.append(("emb", emb))
@@ -1709,7 +1709,7 @@ def SVM_classify(
             "svc__gamma": ["scale", 0.01, 0.1],
         }
 
-    steps = [("scaler", StandardScaler(with_mean=True, with_std=True))]
+    steps = [("scaler", StandardScaler())]
 
     if emb is not None:
         steps.append(("emb", emb))
@@ -1945,7 +1945,7 @@ def task_presence_classification(
     run=None,
     session=None,
     dynamic_pred="no",
-    normalize_dFC=True,
+    normalize_dFC=False,
     train_test_ratio=0.8,
 ):
     """
@@ -2008,6 +2008,12 @@ def task_presence_classification(
         X_train = process_SB_features(X=X_train, measure_name=measure_name)
         X_test = process_SB_features(X=X_test, measure_name=measure_name)
 
+    # center the data by subject before embedding to remove subject effects
+    # separately for train and test sets to avoid data leakage
+    # for both state-based and state-free methods
+    X_train = subject_center(X_train, subj_label_train, mode="demean")
+    X_test = subject_center(X_test, subj_label_test, mode="demean")
+
     ML_scores = {
         "group_lvl": {
             "task": list(),
diff --git a/task_dFC/ML.py b/task_dFC/ML.py
index 136c3ef..271fa5d 100644
--- a/task_dFC/ML.py
+++ b/task_dFC/ML.py
@@ -93,7 +93,7 @@ def run_classification(
     dFC_root,
     output_root,
     dynamic_pred="no",
-    normalize_dFC=True,
+    normalize_dFC=False,
     n_jobs=-1,  # Number of parallel jobs; -1 = all available cores
 ):
     for session in SESSIONS:
@@ -239,7 +239,7 @@ def run_classification(
             dFC_root=dFC_root,
             output_root=ML_root,
             dynamic_pred="no",
-            normalize_dFC=True,
+            normalize_dFC=False,
             n_jobs=8,
         )
     except Exception as e:

From c5ea8db88661060a90b94010af464d94e8d7b105 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 11 Mar 2026 12:35:16 -0400
Subject: [PATCH 355/401] refactor ml_results

---
 .../helper_functions.py                       | 452 +++------
 task_dFC/multi_dataset_analysis/ml_results.py | 939 +++++++++---------
 2 files changed, 617 insertions(+), 774 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 242cc00..cb584c1 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -1,13 +1,10 @@
-import colorsys
 import math
 import re
-import textwrap
 from pathlib import Path
 
 import matplotlib as mpl
 import matplotlib.colors as mcolors
 import matplotlib.pyplot as plt
-import matplotlib.transforms as mtransforms
 import numpy as np
 import pandas as pd
 import seaborn as sns
@@ -62,104 +59,54 @@ def savefig_pub(path_png_or_pdf: str):
 ###################### ml_results ######################
 
 
-def get_cog_domain_info(simul_or_real: str):
-    """
-    Return:
-        DOMAIN_ORDER: list of domains in preferred order
-        TASK2DOMAIN: dict mapping canonical task codes to domains
-        DOMAIN_BASE: dict mapping domains to base colors (hex)
-    """
-    if simul_or_real == "real":
-        # --- Cognitive-Atlas–aligned domains (order on paper) ---
-        DOMAIN_ORDER = [
-            "Arousal & Regulatory Systems",
-            "Cognitive Systems",
-            "Negative Valence System",
-            "Positive Valence System",
-            "Sensorimotor Systems",
-        ]
-
-        # --- Map canonical task codes -> domain ---
-        TASK2DOMAIN = {
-            # Language & Regulatory Systems
-            "emotionregulation": "Arousal & Regulatory Systems",
-            # Cognitive Systems
-            "audsem": "Cognitive Systems",
-            "visrhyme": "Cognitive Systems",
-            "vissem": "Cognitive Systems",
-            "visspell": "Cognitive Systems",
-            "arithmetic": "Cognitive Systems",
-            "stroop": "Cognitive Systems",
-            "cuedts": "Cognitive Systems",
-            "axcpt": "Cognitive Systems",
-            "matching": "Cognitive Systems",
-            "stern": "Cognitive Systems",
-            "st": "Cognitive Systems",
-            "vswm": "Cognitive Systems",
-            "expo": "Cognitive Systems",
-            "recall": "Cognitive Systems",
-            "feedback": "Cognitive Systems",
-            "ppalocalizer": "Cognitive Systems",
-            "localiser": "Cognitive Systems",
-            "localizer": "Cognitive Systems",
-            # Positive Valence System
-            "cic": "Positive Valence System",
-            "fribbids": "Positive Valence System",
-            "risk": "Positive Valence System",
-            "itc": "Positive Valence System",
-            # Negative Valence System
-            "fearlearning": "Negative Valence System",
-            "paingen": "Negative Valence System",
-            # Sensorimotor
-            "motor": "Sensorimotor Systems",
-            "execution": "Sensorimotor Systems",
-            "imagery": "Sensorimotor Systems",
-            "ihg": "Sensorimotor Systems",
-        }
-        # base colors per domain (distinct, colorblind-friendly)
-        DOMAIN_BASE = {
-            "Arousal & Regulatory Systems": "#9467bd",
-            "Cognitive Systems": "#ff7f0e",
-            "Positive Valence System": "#02833E",
-            "Negative Valence System": "#d62728",
-            "Sensorimotor Systems": "#1f77b4",
-        }
-    elif simul_or_real == "simulated":
-        # --- Categories of simulated task paradigms ---
-        DOMAIN_ORDER = [
-            "Simulated Periodic",
-            "Strong Performance on Real Data",
-            "Weak Performance on Real Data",
-        ]
-        # --- Map task codes -> category ---
-        TASK2DOMAIN = {
-            # Simulated Periodic
-            "lowfreqlongrest": "Simulated Periodic",
-            "lowfreqshortrest": "Simulated Periodic",
-            "lowfreqshorttask": "Simulated Periodic",
-            # Optimal Paradigm Design, Strong Performance on Real Data
-            "axcpt": "Strong Performance on Real Data",
-            "stern": "Strong Performance on Real Data",
-            "cuedts": "Strong Performance on Real Data",
-            # Optimal Paradigm Design, Weak Performance on Real Data
-            "execution": "Weak Performance on Real Data",
-            "imagery": "Weak Performance on Real Data",
-            "localizer": "Weak Performance on Real Data",
-            "ppalocalizer": "Weak Performance on Real Data",
-            # Sub-Optimal Paradigm Design, Weak Performance on Real Data
-            "itc": "Weak Performance on Real Data",
-            "stroop": "Weak Performance on Real Data",
-            "risk": "Weak Performance on Real Data",
-        }
-        # base colors per domain (distinct, colorblind-friendly)
-        DOMAIN_BASE = {
-            "Simulated Periodic": "#1f77b4",
-            "Strong Performance on Real Data": "#02833E",
-            "Weak Performance on Real Data": "#d62728",
-        }
-    else:
-        raise ValueError(f"Invalid simul_or_real: {simul_or_real}")
-    return DOMAIN_ORDER, TASK2DOMAIN, DOMAIN_BASE
+DEFAULT_EXPERIMENT_NAME_MAP = {
+    "real": {
+        "emotionregulation": "exp1",
+        "audsem": "exp2",
+        "visrhyme": "exp3",
+        "vissem": "exp4",
+        "visspell": "exp5",
+        "arithmetic": "exp6",
+        "stroop": "exp7",
+        "cuedts": "exp8",
+        "axcpt": "exp9",
+        "matching": "exp10",
+        "stern": "exp11",
+        "st": "exp12",
+        "vswm": "exp13",
+        "expo": "exp14",
+        "recall": "exp15",
+        "feedback": "exp16",
+        "ppalocalizer": "exp17",
+        "localiser": "exp18",
+        "localizer": "exp19",
+        "cic": "exp20",
+        "fribbids": "exp21",
+        "risk": "exp22",
+        "itc": "exp23",
+        "fearlearning": "exp24",
+        "paingen": "exp25",
+        "motor": "exp26",
+        "execution": "exp27",
+        "imagery": "exp28",
+        "ihg": "exp29",
+    },
+    "simulated": {
+        "lowfreqlongrest": "exp1",
+        "lowfreqshortrest": "exp2",
+        "lowfreqshorttask": "exp3",
+        "axcpt": "exp4",
+        "stern": "exp5",
+        "cuedts": "exp6",
+        "execution": "exp7",
+        "imagery": "exp8",
+        "localizer": "exp9",
+        "ppalocalizer": "exp10",
+        "itc": "exp11",
+        "stroop": "exp12",
+        "risk": "exp13",
+    },
+}
 
 
 def canon_task(task_str: str) -> str:
@@ -169,88 +116,84 @@ def canon_task(task_str: str) -> str:
     return s.lower()
 
 
-def task_domain_real(task: str) -> str:
-    _, TASK2DOMAIN, _ = get_cog_domain_info("real")
-    return TASK2DOMAIN.get(canon_task(task), "Other")
+def get_default_experiment_name_map(simul_or_real: str):
+    if simul_or_real not in DEFAULT_EXPERIMENT_NAME_MAP:
+        raise ValueError(f"Invalid simul_or_real: {simul_or_real}")
+    return DEFAULT_EXPERIMENT_NAME_MAP[simul_or_real].copy()
+
+
+def get_present_task_order(tasks_iterable, task_reference_order):
+    present_tasks = list(dict.fromkeys(tasks_iterable))
+    present_set = set(present_tasks)
+    ordered_tasks = [task for task in task_reference_order if task in present_set]
+    remaining_tasks = sorted(
+        [task for task in present_tasks if task not in ordered_tasks],
+        key=lambda task: task.lower(),
+    )
+    return ordered_tasks + remaining_tasks
 
 
-def task_domain_simul(task: str) -> str:
-    _, TASK2DOMAIN, _ = get_cog_domain_info("simulated")
-    return TASK2DOMAIN.get(canon_task(task), "Other")
+def _next_available_experiment_label(used_labels_lower):
+    index = 1
+    while f"exp{index}" in used_labels_lower:
+        index += 1
+    return f"exp{index}"
 
 
-def shade_series_same_hue(base_hex: str, n: int, delta_L=0.08, delta_S=0.06):
+def build_experiment_display_info(tasks_iterable, task_reference_order, simul_or_real):
     """
-    Same hue; small, symmetric tweaks in lightness/saturation → very similar colors.
-    delta_L/S control how similar the shades are (smaller = more similar).
+    Resolve task order, experiment labels, and a stable palette for ML result plots.
+
+    Edit ``DEFAULT_EXPERIMENT_NAME_MAP`` above to change experiment labels.
+    Any task not listed there is auto-assigned the next available ``expN`` label.
     """
-    if n <= 1:
-        return [base_hex]
-    r, g, b = mcolors.to_rgb(base_hex)
-    # colorsys uses HLS (Hue, Lightness, Saturation)
-    h, l, s = colorsys.rgb_to_hls(r, g, b)
-
-    # symmetric lightness offsets around original l
-    offs_L = np.linspace(-delta_L, +delta_L, n)
-    # small saturation jitter to avoid identical look
-    offs_S = np.linspace(-delta_S, +delta_S, n)
-
-    cols = []
-    for dL, dS in zip(offs_L, offs_S):
-        li = float(np.clip(l + dL, 0.05, 0.95))
-        si = float(np.clip(s + dS, 0.20, 0.95))
-        r2, g2, b2 = colorsys.hls_to_rgb(h, li, si)
-        cols.append(mcolors.to_hex((r2, g2, b2)))
-    return cols
-
-
-def build_task_order_and_palette(
-    tasks_iterable, simul_or_real, similarity_L=0.08, similarity_S=0.06
-):
-    """Domain-first task order + very-similar shades per domain."""
-    tasks = list(tasks_iterable)
-    if simul_or_real == "real":
-        dom_of = {t: task_domain_real(t) for t in tasks}
-    elif simul_or_real == "simulated":
-        dom_of = {t: task_domain_simul(t) for t in tasks}
-
-    DOMAIN_ORDER, _, DOMAIN_BASE = get_cog_domain_info(simul_or_real)
-    # order: by DOMAIN_ORDER, then alphabetical within domain
-    task_order = []
-    for dom in DOMAIN_ORDER:
-        ts = sorted([t for t in tasks if dom_of[t] == dom], key=lambda s: s.lower())
-        task_order.extend(ts)
-
-    # palette: near-identical shades per domain
-    palette = {}
-    for dom in DOMAIN_ORDER:
-        ts = [t for t in task_order if dom_of.get(t, "Other") == dom]
-        if not ts:
-            continue
-        shades = shade_series_same_hue(
-            DOMAIN_BASE[dom], len(ts), delta_L=similarity_L, delta_S=similarity_S
-        )
-        for t, col in zip(ts, shades):
-            palette[t] = col
-    return task_order, palette
-
-
-def domain_sorted_rows(index_tasks, TASKS_to_include, simul_or_real):
-    # preserve only tasks present in the matrix
-    present = [t for t in index_tasks if t in TASKS_to_include]
-    # if simul_or_real != "real":
-    #     return sorted(present, key=lambda s: s.lower())
-    # domain-first, then alphabetical
-    if simul_or_real == "real":
-        dom_of = {t: task_domain_real(t) for t in present}
-    elif simul_or_real == "simulated":
-        dom_of = {t: task_domain_simul(t) for t in present}
-    DOMAIN_ORDER, _, _ = get_cog_domain_info(simul_or_real)
-    ordered = []
-    for dom in DOMAIN_ORDER:
-        ts = sorted([t for t in present if dom_of[t] == dom], key=lambda s: s.lower())
-        ordered.extend(ts)
-    return ordered
+    task_order = get_present_task_order(tasks_iterable, task_reference_order)
+    configured_map = get_default_experiment_name_map(simul_or_real)
+
+    task_to_experiment = {}
+    used_labels = {}
+    used_labels_lower = set()
+
+    for task in task_order:
+        experiment_label = configured_map.get(canon_task(task))
+        if experiment_label is None:
+            experiment_label = _next_available_experiment_label(used_labels_lower)
+
+        experiment_label_key = experiment_label.lower()
+        if experiment_label_key in used_labels:
+            raise ValueError(
+                "Experiment labels must be unique for the plotted tasks. "
+                f"Both '{used_labels[experiment_label_key]}' and '{task}' map to "
+                f"'{experiment_label}'."
+            )
+
+        task_to_experiment[task] = experiment_label
+        used_labels[experiment_label_key] = task
+        used_labels_lower.add(experiment_label_key)
+
+    colors = sns.color_palette("husl", n_colors=max(1, len(task_order)))
+    experiment_order = [task_to_experiment[task] for task in task_order]
+    experiment_palette = {
+        experiment_label: mcolors.to_hex(color)
+        for experiment_label, color in zip(experiment_order, colors)
+    }
+
+    return task_order, task_to_experiment, experiment_order, experiment_palette
+
+
+def relabel_heatmap_rows(matrix_df, annot_df, task_reference_order, task_to_experiment):
+    row_order = get_present_task_order(matrix_df.index.tolist(), task_reference_order)
+    experiment_labels = [task_to_experiment[task] for task in row_order]
+
+    relabeled_matrix = matrix_df.loc[row_order].copy()
+    relabeled_matrix.index = experiment_labels
+
+    relabeled_annot = None
+    if annot_df is not None:
+        relabeled_annot = annot_df.loc[row_order].copy()
+        relabeled_annot.index = experiment_labels
+
+    return relabeled_matrix, relabeled_annot, row_order
 
 
 def boldify_axes(ax, xlabel=None, ylabel=None, rotate_xticks=35):
@@ -267,12 +210,10 @@ def boldify_axes(ax, xlabel=None, ylabel=None, rotate_xticks=35):
         plt.setp(ax.get_xticklabels(), fontweight="bold")
 
 
-def draw_grouped_legend_panel(
+def draw_labeled_legend_panel(
     ax_leg,
-    task_order,
-    domain_of,
+    label_order,
     palette,
-    domain_order,
     ncols=2,
     fontsize=8,
     markersize=5,
@@ -281,15 +222,10 @@ def draw_grouped_legend_panel(
     ax_leg.set_axis_off()
     ax_leg.set_xlim(0, 1)
     ax_leg.set_ylim(0, 1)
-    items = []
-    for dom in domain_order:
-        ts = [t for t in task_order if domain_of.get(t, "Other") == dom]
-        if not ts:
-            continue
-        items.append(("header", dom))
-        items.extend(("task", t) for t in ts)
+    rows = len(label_order)
+    if rows == 0:
+        return
 
-    rows = len(items)
     rows_per_col = max(1, math.ceil(rows / ncols))
     x_cols = [0.02 + i * (1.0 / ncols) for i in range(ncols)]
     top = 0.98
@@ -297,7 +233,7 @@ def draw_grouped_legend_panel(
 
     col = 0
     row_in_col = 0
-    for kind, val in items:
+    for label in label_order:
         if row_in_col >= rows_per_col:
             col += 1
             row_in_col = 0
@@ -305,24 +241,18 @@ def draw_grouped_legend_panel(
             break
         x = x_cols[col]
         y = top - row_in_col * dy
-        if kind == "header":
-            ax_leg.text(
-                x, y, val, fontsize=fontsize, fontweight="bold", ha="left", va="top"
-            )
-        else:
-            t = val
-            color = palette.get(t, "0.4")
-            ax_leg.plot(
-                [x],
-                [y],
-                marker="o",
-                ms=markersize,
-                mfc=color,
-                mec="#222222",
-                mew=0.8,
-                ls="None",
-            )
-            ax_leg.text(x + colpad, y, t, fontsize=fontsize, ha="left", va="center")
+        color = palette.get(label, "0.4")
+        ax_leg.plot(
+            [x],
+            [y],
+            marker="o",
+            ms=markersize,
+            mfc=color,
+            mec="#222222",
+            mew=0.8,
+            ls="None",
+        )
+        ax_leg.text(x + colpad, y, label, fontsize=fontsize, ha="left", va="center")
         row_in_col += 1
 
 
@@ -413,112 +343,6 @@ def clip(v):
         )
 
 
-def wrap_domain(dom: str, max_len: int = 20) -> str:
-    # First, break on the preferred delimiters
-    s = dom.replace(" & ", " &\n").replace(", ", ",\n")
-    out = []
-    for seg in s.splitlines():
-        # Then wrap remaining long segments on spaces (no hard splits)
-        wrapped = textwrap.wrap(
-            seg, width=max_len, break_long_words=False, break_on_hyphens=True
-        )
-        out.extend(wrapped if wrapped else [""])
-    return "\n".join(out)
-
-
-def add_domains_between_ylabel_and_ticks(
-    ax,
-    row_order,
-    task_to_domain,
-    label_rotation=30,
-    tick_pad_pts=28,
-    ylabel_pad_pts=60,
-    domain_x_frac=-0.11,  # x position for the domain column (axes frac)
-    left_extend_frac=0.02,  # how far past the text the line extends
-    label_x_offset_frac=0.008,  # small nudge right from domain_x_frac
-    label_align="left",  # "left" | "center" | "right"
-    label_kw=None,
-    sep_kw=None,
-):
-    if label_kw is None:
-        label_kw = dict(
-            fontsize=10, fontweight="bold", color="#222", ha="left", va="center"
-        )  # default to left
-    else:
-        # override HA with requested alignment but keep user's other styles
-        label_kw = {
-            **label_kw,
-            "ha": {"left": "left", "center": "center", "right": "right"}[label_align],
-            "va": "center",
-        }
-    if sep_kw is None:
-        sep_kw = dict(color="#777", lw=1.0, alpha=0.9)
-
-    if not row_order:
-        return
-
-    ax.tick_params(axis="y", pad=tick_pad_pts)
-    ax.yaxis.labelpad = ylabel_pad_pts
-
-    # row centers (as before) ...
-    yticks = ax.get_yticks()
-    yticklabs = [t.get_text() for t in ax.get_yticklabels()]
-    if yticklabs and len(yticklabs) == len(row_order):
-        lbl2y = {lab: y for lab, y in zip(yticklabs, yticks)}
-        y_centers = [lbl2y.get(t, np.nan) for t in row_order]
-    else:
-        n = len(row_order)
-        y0, y1 = ax.get_ylim()
-        base = np.linspace(0.5, n - 0.5, n)
-        if y1 < y0:
-            scale = (y0 - y1) / (n - 1)
-            y_centers = y0 - (base - 0.5) * scale
-        else:
-            scale = (y1 - y0) / (n - 1)
-            y_centers = y0 + (base - 0.5) * scale
-
-    doms = [task_to_domain.get(t, "Other") for t in row_order]
-    blocks = []
-    start = 0
-    for i in range(1, len(doms)):
-        if doms[i] != doms[i - 1]:
-            blocks.append((doms[start], start, i - 1))
-            start = i
-    if len(doms):
-        blocks.append((doms[start], start, len(doms) - 1))
-
-    trans_text = mtransforms.blended_transform_factory(ax.transAxes, ax.transData)
-    for dom, i0, i1 in blocks:
-        y_block = float(np.nanmean(y_centers[i0 : i1 + 1]))
-        # left-aligned text slightly to the right of the domain column anchor
-        x_text = domain_x_frac + (label_x_offset_frac if label_align == "left" else 0.0)
-        dom_updated = wrap_domain(dom, max_len=24)
-        ax.text(
-            x_text,
-            y_block,
-            dom_updated,
-            rotation=label_rotation,
-            transform=trans_text,
-            clip_on=False,
-            **label_kw,
-        )
-
-    # separators (heatmap + extension into the domain column)
-    x_min, x_max = ax.get_xlim()
-    trans_sep = mtransforms.blended_transform_factory(ax.transAxes, ax.transData)
-    for i in range(len(doms) - 1):
-        if doms[i + 1] != doms[i]:
-            y_sep = 0.5 * (y_centers[i] + y_centers[i + 1])
-            ax.hlines(y_sep, x_min, x_max, **sep_kw)  # inside heatmap
-            ax.plot(
-                [0.0, domain_x_frac - left_extend_frac],
-                [y_sep, y_sep],
-                transform=trans_sep,
-                clip_on=False,
-                **sep_kw,
-            )  # into domain column
-
-
 ###################### task_timing_stats ######################
 
 
diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index ae5fef1..7f9b9a5 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -11,20 +11,16 @@
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from helper_functions import (  # pyright: ignore[reportMissingImports]
-    add_domains_between_ylabel_and_ticks,
     boldify_axes,
-    build_task_order_and_palette,
-    domain_sorted_rows,
-    draw_grouped_legend_panel,
-    get_cog_domain_info,
+    build_experiment_display_info,
+    draw_labeled_legend_panel,
+    relabel_heatmap_rows,
     savefig_pub,
     setup_pub_style,
-    task_domain_real,
-    task_domain_simul,
 )
 
-level = "group_lvl"
-keys_not_to_include = [
+LEVEL = "group_lvl"
+KEYS_NOT_TO_INCLUDE = [
     "Logistic regression permutation p_value",
     "Logistic regression permutation score mean",
     "Logistic regression permutation score std",
@@ -32,509 +28,532 @@
     "SVM permutation score mean",
     "SVM permutation score std",
 ]
+GROUP = "test"
+TARGETS = [
+    ("PCA", "Logistic regression balanced accuracy"),
+    ("PLS", "Logistic regression balanced accuracy"),
+    ("PCA", "SVM balanced accuracy"),
+    ("PLS", "SVM balanced accuracy"),
+    ("PCA", "SI"),
+    ("PLS", "SI"),
+]
 
-#######################################################################################
 
-if __name__ == "__main__":
-    # argparse
-    HELPTEXT = """
+def parse_args():
+    helptext = """
     Script to make figures/tables from multi-dataset ML results.
     """
-
-    setup_pub_style()
-    parser = argparse.ArgumentParser(description=HELPTEXT)
-
+    parser = argparse.ArgumentParser(description=helptext)
     parser.add_argument(
         "--multi_dataset_info", type=str, help="path to multi-dataset info file"
     )
     parser.add_argument(
         "--simul_or_real", type=str, help="Specify 'simulated' or 'real' data"
     )
+    return parser.parse_args()
 
-    args = parser.parse_args()
 
-    multi_dataset_info = args.multi_dataset_info
-    simul_or_real = args.simul_or_real
+def read_json(json_file):
+    with open(json_file, "r") as f:
+        return json.load(f)
 
-    # Read dataset info
-    with open(multi_dataset_info, "r") as f:
-        multi_dataset_info = json.load(f)
 
+def get_analysis_config(multi_dataset_info, simul_or_real):
     if simul_or_real == "real":
-        main_root = multi_dataset_info["real_data"]["main_root"]
-        DATASETS = multi_dataset_info["real_data"]["DATASETS"]
-        TASKS_to_include = multi_dataset_info["real_data"]["TASKS_to_include"]
-    elif simul_or_real == "simulated":
-        main_root = multi_dataset_info["simulated_data"]["main_root"]
-        DATASETS = multi_dataset_info["simulated_data"]["DATASETS"]
-        TASKS_to_include = multi_dataset_info["simulated_data"]["TASKS_to_include"]
+        return multi_dataset_info["real_data"]
+    if simul_or_real == "simulated":
+        return multi_dataset_info["simulated_data"]
+    raise ValueError(f"Invalid simul_or_real: {simul_or_real}")
 
-    output_root = f"{multi_dataset_info['output_root']}/ML_results"
 
-    ALL_ML_SCORES = None
-    for dataset in DATASETS:
+def get_classification_input_dir(ml_root, dataset_info):
+    sessions = dataset_info.get("SESSIONS") or [None]
+    session = sessions[0]
+    if session is None:
+        return f"{ml_root}/classification"
+    return f"{ml_root}/classification/{session}"
+
+
+def filter_ml_scores(ml_scores_new, tasks_to_include):
+    filtered_scores = {
+        key: [] for key in ml_scores_new[LEVEL].keys() if key not in KEYS_NOT_TO_INCLUDE
+    }
+
+    for index, task in enumerate(ml_scores_new[LEVEL]["task"]):
+        if task not in tasks_to_include:
+            continue
+        for key in filtered_scores:
+            filtered_scores[key].append(ml_scores_new[LEVEL][key][index])
+
+    return filtered_scores
+
+
+def merge_ml_scores(all_ml_scores, ml_scores_new_updated):
+    if all_ml_scores is None:
+        return ml_scores_new_updated
+
+    for key, values in ml_scores_new_updated.items():
+        if key in all_ml_scores:
+            all_ml_scores[key].extend(values)
+    return all_ml_scores
+
+
+def collect_all_ml_scores(main_root, datasets, tasks_to_include):
+    all_ml_scores = None
+
+    for dataset in datasets:
         print(f"Processing dataset: {dataset}")
         dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
-        ML_root = f"{main_root}/{dataset}/derivatives/ML"
-
-        # Read dataset info
-        with open(dataset_info_file, "r") as f:
-            dataset_info = json.load(f)
-
-        if "SESSIONS" in dataset_info:
-            SESSIONS = dataset_info["SESSIONS"]
-        else:
-            SESSIONS = None
-        if SESSIONS is None:
-            SESSIONS = [None]
-
-        TASKS = dataset_info["TASKS"]
-
-        if "RUNS" in dataset_info:
-            RUNS = dataset_info["RUNS"]
-        else:
-            RUNS = None
-        if RUNS is None:
-            RUNS = {task: [None] for task in TASKS}
-
-        # find all ML_scores_classify_dFC-id.npy in the ML_root/classfication/ folder
-        # for now we will only use the first session
-        session = SESSIONS[0]
-        if session is None:
-            input_dir = f"{ML_root}/classification"
-        else:
-            input_dir = f"{ML_root}/classification/{session}"
+        ml_root = f"{main_root}/{dataset}/derivatives/ML"
+        dataset_info = read_json(dataset_info_file)
+        input_dir = get_classification_input_dir(ml_root, dataset_info)
+
         if not os.path.exists(input_dir):
             print(
                 f"Input directory {input_dir} does not exist. Skipping dataset {dataset}."
             )
             continue
-        ALL_ML_SCORES_FILES = os.listdir(input_dir)
-        ALL_ML_SCORES_FILES = [
-            f for f in ALL_ML_SCORES_FILES if "ML_scores_classify_" in f
+
+        all_ml_scores_files = [
+            filename
+            for filename in os.listdir(input_dir)
+            if "ML_scores_classify_" in filename
         ]
-        for f in ALL_ML_SCORES_FILES:
+
+        for filename in all_ml_scores_files:
             try:
-                ML_scores_new = np.load(f"{input_dir}/{f}", allow_pickle=True).item()
-                # ML_scores_new_updated is a new dictionary with same keys as ML_scores_new but empty lists
-                ML_scores_new_updated = {
-                    key: []
-                    for key in ML_scores_new[level].keys()
-                    if key not in keys_not_to_include
-                }
-                for i in range(len(ML_scores_new[level]["task"])):
-                    if ML_scores_new[level]["task"][i] not in TASKS_to_include:
-                        continue
-
-                    for key in ML_scores_new_updated.keys():
-                        ML_scores_new_updated[key].append(ML_scores_new[level][key][i])
-
-                if ALL_ML_SCORES is None:
-                    ALL_ML_SCORES = ML_scores_new_updated
-                else:
-                    for key in ML_scores_new_updated.keys():
-                        if key in ALL_ML_SCORES:
-                            ALL_ML_SCORES[key].extend(ML_scores_new_updated[key])
-            except Exception as e:
-                print(f"Error loading {f}: {e}")
+                ml_scores_new = np.load(
+                    f"{input_dir}/{filename}", allow_pickle=True
+                ).item()
+                filtered_scores = filter_ml_scores(ml_scores_new, tasks_to_include)
+                all_ml_scores = merge_ml_scores(all_ml_scores, filtered_scores)
+            except Exception as error:
+                print(f"Error loading {filename}: {error}")
                 continue
 
-    # check that the lists in all keys have the same length
-    if ALL_ML_SCORES is not None:
-        lengths = [len(v) for v in ALL_ML_SCORES.values()]
-        if len(set(lengths)) != 1:
-            print(
-                f"Warning: Not all keys have the same length in ALL_ML_SCORES. key and length pairs: {dict(zip(ALL_ML_SCORES.keys(), lengths))}"
-            )
-
-    # save ALL_ML_SCORES
-    if not os.path.exists(output_root):
-        os.makedirs(output_root)
-    np.save(f"{output_root}/ALL_ML_SCORES_{simul_or_real}.npy", ALL_ML_SCORES)
-
-    # ===== Plotting =====
-    DOMAIN_ORDER, TASK2DOMAIN, DOMAIN_BASE = get_cog_domain_info(simul_or_real)
-    # knobs
-    GROUP = "test"
-    TARGETS = [
-        ("PCA", "Logistic regression balanced accuracy"),
-        ("PLS", "Logistic regression balanced accuracy"),
-        ("PCA", "SVM balanced accuracy"),
-        ("PLS", "SVM balanced accuracy"),
-        ("PCA", "SI"),
-        ("PLS", "SI"),
-    ]
-    # -------------------------------------------------------------------
+    return all_ml_scores
 
-    sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.2})
-    sns.set_style("darkgrid")
 
-    AGG_FUNC = np.median  # across-run aggregation
+def validate_score_lengths(all_ml_scores):
+    if all_ml_scores is None:
+        return
 
-    for embedding, metric in TARGETS:
-        # ---- slice ----
-        df = pd.DataFrame.from_dict(ALL_ML_SCORES)
-        df = df[df["task"].isin(TASKS_to_include)]
-        df = df[(df["embedding"] == embedding) & (df["group"] == GROUP)]
-
-        # methods in alphabetical order (your current rule)
-        method_order = sorted(df["dFC method"].unique(), key=lambda s: s.lower())
-        df["dFC method"] = pd.Categorical(
-            df["dFC method"], categories=method_order, ordered=True
+    lengths = [len(values) for values in all_ml_scores.values()]
+    if len(set(lengths)) != 1:
+        print(
+            "Warning: Not all keys have the same length in ALL_ML_SCORES. "
+            f"key and length pairs: {dict(zip(all_ml_scores.keys(), lengths))}"
         )
 
-        # --- domain tagging & task ordering/coloring (only for real data) ---
-        if simul_or_real == "real":
-            df["domain"] = df["task"].map(task_domain_real)
-        elif simul_or_real == "simulated":
-            df["domain"] = df["task"].map(task_domain_simul)
-        # Use tasks present in THIS slice
-        task_order, task_palette = build_task_order_and_palette(
+
+def save_all_ml_scores(all_ml_scores, output_root, simul_or_real):
+    os.makedirs(output_root, exist_ok=True)
+    np.save(f"{output_root}/ALL_ML_SCORES_{simul_or_real}.npy", all_ml_scores)
+
+
+def prepare_metric_dataframe(
+    all_ml_scores, tasks_to_include, embedding, metric, simul_or_real
+):
+    df = pd.DataFrame.from_dict(all_ml_scores)
+    df = df[df["task"].isin(tasks_to_include)]
+    df = df[(df["embedding"] == embedding) & (df["group"] == GROUP)].copy()
+
+    method_order = sorted(df["dFC method"].unique(), key=lambda method: method.lower())
+    df["dFC method"] = pd.Categorical(
+        df["dFC method"], categories=method_order, ordered=True
+    )
+
+    task_order, task_to_experiment, experiment_order, experiment_palette = (
+        build_experiment_display_info(
             df["task"].unique(),
+            task_reference_order=tasks_to_include,
             simul_or_real=simul_or_real,
-            similarity_L=0.05,
-            similarity_S=0.04,
         )
+    )
+    df["experiment"] = df["task"].map(task_to_experiment)
+
+    return (
+        df,
+        method_order,
+        task_order,
+        task_to_experiment,
+        experiment_order,
+        experiment_palette,
+    )
 
-        # ===== build BEST and ACROSS tables =====
-        counts_task = df.groupby("task")["run"].nunique()
-        multi_tasks = counts_task[counts_task > 1].index
-        df_multi = df[
-            df["task"].isin(multi_tasks)
-        ]  # <- use this dataframe for ACROSS figures
-
-        # ACROSS heatmap (aggregate then pivot):
-        if not df_multi.empty:
-            df_across = (
-                df_multi.groupby(["task", "dFC method"], observed=True)[metric]
-                .agg(score=AGG_FUNC)
-                .reset_index()
-            )
 
-        # BEST: one row per (task, method) with the winning run kept
-        df_best = (
-            df.sort_values(["task", "dFC method", metric], ascending=[True, True, False])
-            .drop_duplicates(subset=["task", "dFC method"], keep="first")
-            .rename(columns={metric: "score"})
-        )
+def build_best_and_multi_tables(df, metric):
+    counts_task = df.groupby("task")["run"].nunique()
+    multi_tasks = counts_task[counts_task > 1].index
+    df_multi = df[df["task"].isin(multi_tasks)].copy()
+
+    df_best = (
+        df.sort_values(["task", "dFC method", metric], ascending=[True, True, False])
+        .drop_duplicates(subset=["task", "dFC method"], keep="first")
+        .rename(columns={metric: "score"})
+    )
+
+    return df_best, df_multi
+
+
+def get_pointplot_limits(metric):
+    if metric == "SI":
+        return -1.0, 1.0
+    return 0.5, 1.0
+
+
+def get_heatmap_limits(metric):
+    if metric == "SI":
+        return None, 1.0, 0.0
+    return 0.5 - 1e-6, 1.0, 0.5
+
+
+def style_boxplot(ax, box_edge):
+    for artist in ax.artists:
+        artist.set_edgecolor(box_edge)
+        facecolor = artist.get_facecolor()
+        artist.set_facecolor((facecolor[0], facecolor[1], facecolor[2], 0.12))
+    for line in ax.lines:
+        line.set_color(box_edge)
+        line.set_alpha(0.5)
+        line.set_zorder(1)
+
 
-        # ----------- POINTPLOT (BEST) -----------
-        # 1) Make a 2-panel figure: left=plot, right=legend
-        fig = plt.figure(figsize=(max(10, 0.6 * len(method_order)) + 5.0, 7.0))
-        gs = fig.add_gridspec(ncols=2, nrows=1, width_ratios=[1.0, 0.5], wspace=0.05)
-        ax = fig.add_subplot(gs[0, 0])
-        ax_leg = fig.add_subplot(gs[0, 1])  # empty panel for the legend
-
-        # --- BACKGROUND: semi-transparent boxplot across tasks (per method) ---
-        # one value per (task, method): use df_best['score']
-        box_face = to_rgba("#DE9995", 0.18)  # neutral gray, ~18% opacity
-        box_edge = "#730800"
-
-        sns.boxplot(
-            data=df_best,
-            x="dFC method",
-            y="score",
-            order=method_order,
-            whis=(
-                5,
-                95,
-            ),  # <- 5th–95th percentile whiskers (change to "range", 1.5, etc. if you prefer)
-            fliersize=0,  # hide outlier dots (keeps background clean)
-            linewidth=1.0,
-            width=0.2,  # narrower than default so points are visible
-            color=box_face,  # face color (we’ll also set edge color below)
-            ax=ax,
-            zorder=1,
+def overlay_method_means(ax, df_best, lower, upper):
+    means = df_best.groupby("dFC method", observed=True)["score"].mean()
+    xticks = ax.get_xticks()
+    xticklabels = [tick.get_text() for tick in ax.get_xticklabels()]
+    x_positions = {label: xticks[index] for index, label in enumerate(xticklabels)}
+
+    halfwidth = 0.1
+    for method, mean_score in means.items():
+        if method not in x_positions or pd.isna(mean_score):
+            continue
+        mean_score = min(upper, max(lower, mean_score))
+        x_position = x_positions[method]
+        ax.hlines(
+            mean_score,
+            x_position - halfwidth,
+            x_position + halfwidth,
+            colors="#050505",
+            lw=2.4,
+            zorder=3,
         )
-        # ensure edges are visible but subtle; also enforce alpha on faces
-        for artist in ax.artists:
-            artist.set_edgecolor(box_edge)
-            fc = artist.get_facecolor()
-            artist.set_facecolor((fc[0], fc[1], fc[2], 0.12))  # set alpha explicitly
-        for line in ax.lines:  # whiskers/medians/caps
-            line.set_color(box_edge)
-            line.set_alpha(0.5)
-            line.set_zorder(1)
-
-        # --- OVERLAY: method mean across tasks (black horizontal line) ---
-        # (This is separate from the boxplot's median; gives an easy mean comparison)
-        means = df_best.groupby("dFC method", observed=True)["score"].mean()
-        xticks = ax.get_xticks()
-        xlabs = [t.get_text() for t in ax.get_xticklabels()]
-        xpos = {lab: xticks[i] for i, lab in enumerate(xlabs)}
-
-        # bounds (SI vs BA)
-        if metric == "SI":
-            lower, upper = -1.0, 1.0
-        else:
-            lower, upper = 0.5, 1.0
-
-        halfwidth = 0.1  # how wide the mean bar is around each tick
-        for meth, m in means.items():
-            if meth in xpos and pd.notna(m):
-                m = min(upper, max(lower, m))  # clip to metric range
-                x = xpos[meth]
-                ax.hlines(
-                    m, x - halfwidth, x + halfwidth, colors="#050505", lw=2.4, zorder=3
-                )
-
-        # --- FOREGROUND: your existing per-task pointplot (on top) ---
-        sns.pointplot(
-            data=df_best,
-            x="dFC method",
-            y="score",
-            hue="task",
-            order=method_order,
-            hue_order=task_order,
-            dodge=0.4,
-            errorbar=None,
-            linestyles="",
-            markers="o",
-            palette=task_palette,
-            ax=ax,
-            zorder=6,
+
+
+def finalize_marker_edges(ax):
+    for line in ax.lines:
+        try:
+            line.set_markeredgecolor("#222222")
+            line.set_markeredgewidth(0.8)
+        except Exception:
+            pass
+
+
+def plot_best_pointplot(
+    df_best,
+    method_order,
+    experiment_order,
+    experiment_palette,
+    output_root,
+    embedding,
+    metric,
+    simul_or_real,
+):
+    figure = plt.figure(figsize=(max(10, 0.6 * len(method_order)) + 5.0, 7.0))
+    grid_spec = figure.add_gridspec(
+        ncols=2, nrows=1, width_ratios=[1.0, 0.5], wspace=0.05
+    )
+    ax = figure.add_subplot(grid_spec[0, 0])
+    ax_leg = figure.add_subplot(grid_spec[0, 1])
+
+    box_face = to_rgba("#DE9995", 0.18)
+    box_edge = "#730800"
+
+    sns.boxplot(
+        data=df_best,
+        x="dFC method",
+        y="score",
+        order=method_order,
+        whis=(5, 95),
+        fliersize=0,
+        linewidth=1.0,
+        width=0.2,
+        color=box_face,
+        ax=ax,
+        zorder=1,
+    )
+    style_boxplot(ax, box_edge)
+
+    lower, upper = get_pointplot_limits(metric)
+    overlay_method_means(ax, df_best, lower, upper)
+
+    sns.pointplot(
+        data=df_best,
+        x="dFC method",
+        y="score",
+        hue="experiment",
+        order=method_order,
+        hue_order=experiment_order,
+        dodge=0.4,
+        errorbar=None,
+        linestyles="",
+        markers="o",
+        palette=experiment_palette,
+        ax=ax,
+        zorder=6,
+    )
+    finalize_marker_edges(ax)
+
+    ax.set_xlabel("dFC method")
+    ax.set_ylabel(metric)
+    if metric == "SI":
+        ax.set_ylim(top=1.02)
+    else:
+        ax.set_ylim(0.48, 1.02)
+    ax.grid(True, axis="y", alpha=0.25)
+    sns.despine(ax=ax, top=True, right=True)
+    plt.setp(ax.get_xticklabels(), rotation=35, ha="right")
+
+    if ax.legend_:
+        ax.legend_.remove()
+
+    draw_labeled_legend_panel(
+        ax_leg,
+        experiment_order,
+        experiment_palette,
+        ncols=2 if simul_or_real == "real" else 1,
+        fontsize=8,
+        markersize=5,
+    )
+    ax_leg.set_title("Experiment", fontsize=9, pad=4, fontweight="bold")
+    box = ax_leg.get_position()
+    ax_leg.set_position([box.x0, box.y0 - 0.03, box.width, box.height])
+
+    boldify_axes(ax, xlabel="dFC method", ylabel=metric)
+    figure.tight_layout(rect=[0.02, 0.02, 0.98, 0.98])
+
+    savefig_pub(
+        f"{output_root}/ML_scores_{embedding}_{metric}_{LEVEL}_{simul_or_real}_best.png"
+    )
+    plt.close(figure)
+
+
+def plot_best_heatmap(
+    df_best,
+    method_order,
+    task_order,
+    task_to_experiment,
+    output_root,
+    embedding,
+    metric,
+    simul_or_real,
+):
+    matrix_best = df_best.pivot(index="task", columns="dFC method", values="score")
+    annot_best = df_best.assign(
+        label=lambda df_plot: df_plot["score"].map(lambda value: f"{value:.2f}")
+    ).pivot(index="task", columns="dFC method", values="label")
+
+    matrix_best, annot_best, _ = relabel_heatmap_rows(
+        matrix_best,
+        annot_best,
+        task_reference_order=task_order,
+        task_to_experiment=task_to_experiment,
+    )
+    col_order = [method for method in method_order if method in matrix_best.columns]
+
+    if simul_or_real == "real":
+        width = max(10, 0.65 * len(col_order))
+        height = max(6.0, 0.30 * len(matrix_best.index))
+    else:
+        width = max(11, 11 / 7 * len(col_order))
+        height = max(7.0, 0.35 * len(matrix_best.index))
+
+    figure, ax = plt.subplots(figsize=(width, height))
+    vmin, vmax, center = get_heatmap_limits(metric)
+    heatmap = sns.heatmap(
+        matrix_best.loc[:, col_order],
+        vmin=vmin,
+        vmax=vmax,
+        center=center,
+        cmap="coolwarm",
+        annot=annot_best.loc[:, col_order],
+        fmt="",
+        annot_kws={"fontsize": 9, "fontweight": "bold", "linespacing": 1.15},
+        cbar_kws={"shrink": 0.7, "pad": 0.02},
+        ax=ax,
+    )
+    colorbar = heatmap.collections[0].colorbar
+    colorbar.set_label(metric, fontsize=10, fontweight="bold")
+    colorbar.ax.tick_params(labelsize=9)
+
+    boldify_axes(ax, xlabel="dFC method", ylabel="Experiment", rotate_xticks=35)
+    ax.set_xlabel("dFC method")
+    ax.set_ylabel("Experiment")
+    plt.setp(ax.get_xticklabels(), fontweight="bold", rotation=35, ha="right")
+    plt.setp(ax.get_yticklabels(), fontweight="bold")
+    sns.despine(ax=ax, top=True, right=True)
+    plt.tight_layout()
+    savefig_pub(
+        f"{output_root}/ML_scores_heatmap_{embedding}_{metric}_{LEVEL}_{simul_or_real}_best.png"
+    )
+    plt.close(figure)
+
+
+def build_across_heatmap_data(df_multi, metric, task_order, task_to_experiment):
+    summary = (
+        df_multi.groupby(["task", "dFC method"], observed=True)[metric]
+        .agg(n="count", med="median", vmin="min", vmax="max")
+        .reset_index()
+    )
+
+    matrix_across = summary.pivot(index="task", columns="dFC method", values="med")
+    annot_across = summary.assign(
+        label=lambda df_plot: df_plot["vmin"].map(lambda value: f"{value:.2f}")
+        + "\u2013"
+        + df_plot["vmax"].map(lambda value: f"{value:.2f}")
+        + "\n"
+        + df_plot["n"].map(lambda value: f"n={value}")
+    ).pivot(index="task", columns="dFC method", values="label")
+
+    return relabel_heatmap_rows(
+        matrix_across,
+        annot_across,
+        task_reference_order=task_order,
+        task_to_experiment=task_to_experiment,
+    )
+
+
+def plot_across_heatmap(
+    df_multi,
+    method_order,
+    task_order,
+    task_to_experiment,
+    output_root,
+    embedding,
+    metric,
+    simul_or_real,
+):
+    if df_multi.empty:
+        print(
+            f"[ACROSS-RUN] No tasks with ≥2 runs for {embedding} / {metric} — skipping across-run figures."
         )
+        return
 
-        # optional: crisp marker edges
-        for line in ax.lines:
-            try:
-                line.set_markeredgecolor("#222222")
-                line.set_markeredgewidth(0.8)
-            except Exception:
-                pass
-
-        ax.set_xlabel("dFC method")
-        ax.set_ylabel(metric)
-        if metric == "SI":
-            ax.set_ylim(top=1.02)
-        else:
-            ax.set_ylim(0.48, 1.02)
-        ax.grid(True, axis="y", alpha=0.25)
-        sns.despine(ax=ax, top=True, right=True)
-        plt.setp(ax.get_xticklabels(), rotation=35, ha="right")
-
-        # kill any in-axes legend and draw grouped legend in the right panel
-        if ax.legend_:
-            ax.legend_.remove()
-        if simul_or_real == "real":
-            domain_of = {t: task_domain_real(t) for t in task_order}
-            draw_grouped_legend_panel(
-                ax_leg,
-                task_order,
-                domain_of,
-                task_palette,
-                DOMAIN_ORDER,
-                ncols=2,
-                fontsize=8,
-                markersize=5,
-            )
-            ax_leg.set_title("Task Paradigm", fontsize=9, pad=4, fontweight="bold")
-        elif simul_or_real == "simulated":
-            domain_of = {t: task_domain_simul(t) for t in task_order}
-            draw_grouped_legend_panel(
-                ax_leg,
-                task_order,
-                domain_of,
-                task_palette,
-                DOMAIN_ORDER,
-                ncols=1,
-                fontsize=8,
-                markersize=5,
-            )
-            ax_leg.set_title("Task Paradigm", fontsize=9, pad=4, fontweight="bold")
+    matrix_across, annot_across, _ = build_across_heatmap_data(
+        df_multi,
+        metric,
+        task_order,
+        task_to_experiment,
+    )
+    col_order = [method for method in method_order if method in matrix_across.columns]
+    width = max(9.0, 11 / 7 * len(col_order))
+    height = max(7.0, 7 / 20 * len(matrix_across.index))
+
+    figure, ax = plt.subplots(figsize=(width, height))
+    vmin, vmax, center = get_heatmap_limits(metric)
+    heatmap = sns.heatmap(
+        matrix_across.loc[:, col_order],
+        vmin=vmin,
+        vmax=vmax,
+        center=center,
+        cmap="coolwarm",
+        annot=annot_across.loc[:, col_order],
+        fmt="",
+        annot_kws={"fontsize": 9, "fontweight": "bold", "linespacing": 1.15},
+        cbar_kws={"shrink": 0.7, "pad": 0.02},
+        ax=ax,
+    )
 
-        box = ax_leg.get_position()
-        ax_leg.set_position(
-            [box.x0, box.y0 - 0.03, box.width, box.height]
-        )  # move down by ~3% fig height
+    colorbar = heatmap.collections[0].colorbar
+    colorbar.set_label(metric, fontsize=10, fontweight="bold")
+    boldify_axes(ax, xlabel="dFC method", ylabel="Experiment", rotate_xticks=35)
+    ax.set_xlabel("dFC method")
+    ax.set_ylabel("Experiment")
+    plt.setp(ax.get_xticklabels(), fontweight="bold", rotation=35, ha="right")
+    plt.setp(ax.get_yticklabels(), fontweight="bold")
+    sns.despine(ax=ax, top=True, right=True)
+    plt.tight_layout()
+    savefig_pub(
+        f"{output_root}/ML_scores_heatmap_{embedding}_{metric}_{LEVEL}_{simul_or_real}_across.png"
+    )
+    plt.close(figure)
 
-        boldify_axes(ax, xlabel="dFC method", ylabel=metric)
 
-        # IMPORTANT: don't call a plain tight_layout() now; the GridSpec already allocates space.
-        # If you must, keep a small margin:
-        fig.tight_layout(rect=[0.02, 0.02, 0.98, 0.98])
+def generate_all_plots(all_ml_scores, tasks_to_include, output_root, simul_or_real):
+    sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.2})
+    sns.set_style("darkgrid")
 
-        savefig_pub(
-            f"{output_root}/ML_scores_{embedding}_{metric}_{level}_{simul_or_real}_best.png"
+    for embedding, metric in TARGETS:
+        (
+            df,
+            method_order,
+            task_order,
+            task_to_experiment,
+            experiment_order,
+            experiment_palette,
+        ) = prepare_metric_dataframe(
+            all_ml_scores,
+            tasks_to_include,
+            embedding,
+            metric,
+            simul_or_real,
         )
-        plt.close(fig)
-
-        # ----------- HEATMAPS -----------
-        # BEST heatmap: values from df_best
-        mat_best = df_best.pivot(index="task", columns="dFC method", values="score")
-        row_order = domain_sorted_rows(mat_best.index, TASKS_to_include, simul_or_real)
-        col_order = [m for m in method_order if m in mat_best.columns]
-
-        annot_best = df_best.assign(
-            label=lambda x: x["score"].map(lambda v: f"{v:.2f}")
-        ).pivot(index="task", columns="dFC method", values="label")
-
-        if simul_or_real == "real":
-            w = max(10, 0.65 * len(col_order))
-            h = max(6.0, 0.30 * len(row_order))
-        else:
-            w = max(11, 11 / 7 * len(col_order))
-            h = max(7.0, 0.35 * len(row_order))
-        fig, ax = plt.subplots(figsize=(w, h))
-        vmin, vmax, center = (
-            (None, 1.0, 0.0) if metric == "SI" else (0.5 - 1e-6, 1.0, 0.5)
+        df_best, df_multi = build_best_and_multi_tables(df, metric)
+
+        plot_best_pointplot(
+            df_best,
+            method_order,
+            experiment_order,
+            experiment_palette,
+            output_root,
+            embedding,
+            metric,
+            simul_or_real,
         )
-        hm = sns.heatmap(
-            mat_best.loc[row_order, col_order],
-            vmin=vmin,
-            vmax=vmax,
-            center=center,
-            cmap="coolwarm",
-            annot=annot_best.loc[row_order, col_order],
-            fmt="",
-            annot_kws={"fontsize": 9, "fontweight": "bold", "linespacing": 1.15},
-            cbar_kws={"shrink": 0.7, "pad": 0.02},
-            ax=ax,
+        plot_best_heatmap(
+            df_best,
+            method_order,
+            task_order,
+            task_to_experiment,
+            output_root,
+            embedding,
+            metric,
+            simul_or_real,
         )
-        cbar = hm.collections[0].colorbar
-        cbar.set_label(metric, fontsize=10)
-        cbar.ax.tick_params(labelsize=9)
-        boldify_axes(ax, xlabel="dFC method", ylabel="Task Paradigm", rotate_xticks=35)
-
-        if simul_or_real == "real":
-            task_to_domain = {
-                t: task_domain_real(t) for t in row_order
-            }  # your task_domain helper
-            domain_x_frac = -0.8
-            ylabel_pad_pts = 130
-        elif simul_or_real == "simulated":
-            task_to_domain = {
-                t: task_domain_simul(t) for t in row_order
-            }  # your task_domain helper
-            domain_x_frac = -1.0
-            ylabel_pad_pts = 110
-        add_domains_between_ylabel_and_ticks(
-            ax,
-            row_order=row_order,
-            task_to_domain=task_to_domain,
-            label_rotation=0,  # try 0, 20, 30, or 45
-            tick_pad_pts=0,  # pushes tick labels to the right
-            ylabel_pad_pts=ylabel_pad_pts,  # moves y-axis label left
-            domain_x_frac=domain_x_frac,  # where domain column sits (more negative = further left)
-            left_extend_frac=0.01,  # extend the line a bit further left than the text
-            label_x_offset_frac=0.010,  # nudge text right from the anchor
-            label_align="left",  # <<< left-align labels
-            label_kw=dict(
-                fontsize=9, fontweight="bold", color="#222", ha="center", va="center"
-            ),
-            sep_kw=dict(color="#777", lw=1.0, alpha=0.9),
+        plot_across_heatmap(
+            df_multi,
+            method_order,
+            task_order,
+            task_to_experiment,
+            output_root,
+            embedding,
+            metric,
+            simul_or_real,
         )
 
-        # Bold colorbar label (metric name) too:
-        cbar.set_label(metric, fontsize=10, fontweight="bold")
-
-        ax.set_xlabel("dFC method")
-        ax.set_ylabel("Task Paradigm")
-        # ax.set_title(f"Best across runs • {embedding} • {metric}", pad=8)
-        ax.tick_params(axis="x", labelrotation=35, labelsize=9)
-        plt.setp(ax.get_xticklabels(), fontweight="bold", rotation=35, ha="right")
-        plt.setp(ax.get_yticklabels(), fontweight="bold")
-        sns.despine(ax=ax, top=True, right=True)
-        plt.tight_layout()
-        savefig_pub(
-            f"{output_root}/ML_scores_heatmap_{embedding}_{metric}_{level}_{simul_or_real}_best.png"
-        )
-        plt.close(fig)
 
-        # ACROSS heatmap: color = median; annotation = min–max & n (across runs)
-        if df_multi.empty:
-            print(
-                f"[ACROSS-RUN] No tasks with ≥2 runs for {embedding} / {metric} — skipping across-run figures."
-            )
-        else:
-            # aggregate across runs
-            s = (
-                df_multi.groupby(["task", "dFC method"], observed=True)[metric]
-                .agg(n="count", med="median", vmin="min", vmax="max")
-                .reset_index()
-            )
+def main():
+    args = parse_args()
+    setup_pub_style()
 
-            # heatmap scaling (avoid name clash with s['vmin'] / s['vmax'])
-            if metric == "SI":
-                cmin, cmax, ccenter = None, 1.0, 0.0  # SI in [-1,1], center at 0
-            else:
-                cmin, cmax, ccenter = (
-                    0.5 - 1e-6,
-                    1.0,
-                    0.5,
-                )  # accuracy in [0.5,1], center at chance
-
-            # pivots
-            mat_across = s.pivot(index="task", columns="dFC method", values="med")
-            ann_text = s.assign(
-                label=lambda d: d["vmin"].map(lambda v: f"{v:.2f}")
-                + "\u2013"
-                + d["vmax"].map(lambda v: f"{v:.2f}")
-                + "\n"
-                + d["n"].map(lambda n: f"n={n}")
-            ).pivot(index="task", columns="dFC method", values="label")
-
-            # order
-            row_order = domain_sorted_rows(
-                mat_across.index, TASKS_to_include, simul_or_real
-            )
-            col_order = [m for m in method_order if m in mat_across.columns]
-
-            # plot
-            w = max(9.0, 11 / 7 * len(col_order))
-            h = max(7.0, 7 / 20 * len(row_order))
-            fig, ax = plt.subplots(figsize=(w, h))
-            hm = sns.heatmap(
-                mat_across.loc[row_order, col_order],
-                vmin=cmin,
-                vmax=cmax,
-                center=ccenter,
-                cmap="coolwarm",
-                annot=ann_text.loc[row_order, col_order],
-                fmt="",
-                annot_kws={"fontsize": 9, "fontweight": "bold", "linespacing": 1.15},
-                cbar_kws={"shrink": 0.7, "pad": 0.02},
-                ax=ax,
-            )
+    multi_dataset_info = read_json(args.multi_dataset_info)
+    analysis_config = get_analysis_config(multi_dataset_info, args.simul_or_real)
+    main_root = analysis_config["main_root"]
+    datasets = analysis_config["DATASETS"]
+    tasks_to_include = analysis_config["TASKS_to_include"]
+    output_root = f"{multi_dataset_info['output_root']}/ML_results"
 
-            # domain sidebar & separators (your helper)
-            if simul_or_real == "real":
-                task_to_domain = {t: task_domain_real(t) for t in row_order}
-                domain_x_frac = -0.5
-                ylabel_pad_pts = 160
-            else:  # "simulated"
-                task_to_domain = {t: task_domain_simul(t) for t in row_order}
-                domain_x_frac = -0.6
-                ylabel_pad_pts = 140
-
-            add_domains_between_ylabel_and_ticks(
-                ax,
-                row_order=row_order,
-                task_to_domain=task_to_domain,
-                label_rotation=0,
-                tick_pad_pts=0,
-                ylabel_pad_pts=ylabel_pad_pts,
-                domain_x_frac=domain_x_frac,
-                left_extend_frac=0.01,
-                label_x_offset_frac=0.010,
-                label_align="left",
-                label_kw=dict(
-                    fontsize=9, fontweight="bold", color="#222", ha="center", va="center"
-                ),
-                sep_kw=dict(color="#777", lw=1.0, alpha=0.9),
-            )
+    all_ml_scores = collect_all_ml_scores(main_root, datasets, tasks_to_include)
+    validate_score_lengths(all_ml_scores)
+    save_all_ml_scores(all_ml_scores, output_root, args.simul_or_real)
+    generate_all_plots(
+        all_ml_scores,
+        tasks_to_include,
+        output_root,
+        args.simul_or_real,
+    )
 
-            # cosmetics
-            cbar = hm.collections[0].colorbar
-            cbar.set_label(metric, fontsize=10, fontweight="bold")
-            boldify_axes(
-                ax, xlabel="dFC method", ylabel="Task Paradigm", rotate_xticks=35
-            )
-            plt.setp(ax.get_xticklabels(), fontweight="bold", rotation=35, ha="right")
-            plt.setp(ax.get_yticklabels(), fontweight="bold")
-            sns.despine(ax=ax, top=True, right=True)
-            plt.tight_layout()
-            savefig_pub(
-                f"{output_root}/ML_scores_heatmap_{embedding}_{metric}_{level}_{simul_or_real}_across.png"
-            )
-            plt.close(fig)
+
+if __name__ == "__main__":
+    main()

From 0384a208735499b375f3c40a4654483cebb484a4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 11 Mar 2026 13:18:39 -0400
Subject: [PATCH 356/401] remove legend ml_results

---
 task_dFC/multi_dataset_analysis/ml_results.py | 166 +++++++++++++++---
 1 file changed, 146 insertions(+), 20 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index 7f9b9a5..cb7d5e3 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -13,7 +13,6 @@
 from helper_functions import (  # pyright: ignore[reportMissingImports]
     boldify_axes,
     build_experiment_display_info,
-    draw_labeled_legend_panel,
     relabel_heatmap_rows,
     savefig_pub,
     setup_pub_style,
@@ -37,6 +36,9 @@
     ("PCA", "SI"),
     ("PLS", "SI"),
 ]
+TOP_POINT_LABEL_COUNT = 5
+TOP_EXPERIMENT_SHAPES = 3
+TOP_EXPERIMENT_MARKERS = ["*", "^", "D", "P", "X"]
 
 
 def parse_args():
@@ -250,6 +252,128 @@ def finalize_marker_edges(ax):
             pass
 
 
+def _as_rgb_tuple(color_value):
+    rgba = to_rgba(color_value)
+    return np.array([rgba[0], rgba[1], rgba[2]], dtype=float)
+
+
+def extract_pointplot_coordinates(ax, method_order, experiment_order, experiment_palette):
+    target_rgb = {
+        experiment: _as_rgb_tuple(color)
+        for experiment, color in experiment_palette.items()
+    }
+
+    candidate_lines = []
+    for line in ax.lines:
+        x_data = np.asarray(line.get_xdata(), dtype=float)
+        y_data = np.asarray(line.get_ydata(), dtype=float)
+        marker = line.get_marker()
+        if marker in {None, "", "None", " "}:
+            continue
+        if x_data.size != len(method_order) or y_data.size != len(method_order):
+            continue
+        candidate_lines.append(line)
+
+    line_color_distances = []
+    for line in candidate_lines:
+        line_rgb = _as_rgb_tuple(line.get_markerfacecolor())
+        for experiment in experiment_order:
+            dist = float(np.linalg.norm(line_rgb - target_rgb[experiment]))
+            line_color_distances.append((dist, id(line), line, experiment))
+
+    assigned_lines = {}
+    used_experiments = set()
+    used_lines = set()
+    for _, line_id, line, experiment in sorted(line_color_distances, key=lambda x: x[0]):
+        if experiment in used_experiments or line_id in used_lines:
+            continue
+        assigned_lines[experiment] = line
+        used_experiments.add(experiment)
+        used_lines.add(line_id)
+        if len(assigned_lines) == len(experiment_order):
+            break
+
+    coordinates = {}
+    for experiment, line in assigned_lines.items():
+        x_data = np.asarray(line.get_xdata(), dtype=float)
+        y_data = np.asarray(line.get_ydata(), dtype=float)
+        coordinates[experiment] = {}
+        for method_index, method in enumerate(method_order):
+            y_value = y_data[method_index]
+            if np.isnan(y_value):
+                continue
+            coordinates[experiment][method] = (x_data[method_index], y_value)
+    return coordinates
+
+
+def overlay_top_experiment_shapes(
+    ax,
+    df_best,
+    point_coordinates,
+    experiment_palette,
+    top_experiment_shapes,
+):
+    if top_experiment_shapes <= 0:
+        return
+
+    top_experiments = (
+        df_best.groupby("experiment", observed=True)["score"]
+        .mean()
+        .sort_values(ascending=False)
+        .head(top_experiment_shapes)
+        .index.tolist()
+    )
+
+    for rank, experiment in enumerate(top_experiments):
+        if experiment not in point_coordinates:
+            continue
+        marker = TOP_EXPERIMENT_MARKERS[rank % len(TOP_EXPERIMENT_MARKERS)]
+        points = list(point_coordinates[experiment].values())
+        if not points:
+            continue
+        x_vals = [pt[0] for pt in points]
+        y_vals = [pt[1] for pt in points]
+        ax.scatter(
+            x_vals,
+            y_vals,
+            marker=marker,
+            s=65,
+            c=experiment_palette[experiment],
+            edgecolors="#111111",
+            linewidths=0.8,
+            zorder=8,
+        )
+
+
+def annotate_top_scoring_points(ax, df_best, point_coordinates, top_point_label_count):
+    if top_point_label_count <= 0:
+        return
+
+    top_rows = df_best.nlargest(top_point_label_count, columns="score")
+    for _, row in top_rows.iterrows():
+        experiment = row["experiment"]
+        method = row["dFC method"]
+        if experiment not in point_coordinates:
+            continue
+        if method not in point_coordinates[experiment]:
+            continue
+
+        x_value, y_value = point_coordinates[experiment][method]
+        ax.annotate(
+            experiment,
+            xy=(x_value, y_value),
+            xytext=(0, 6),
+            textcoords="offset points",
+            ha="center",
+            va="bottom",
+            fontsize=7,
+            fontweight="bold",
+            color="#1A1A1A",
+            bbox=dict(boxstyle="round,pad=0.14", fc="white", ec="none", alpha=0.75),
+            zorder=9,
+        )
+
+
 def plot_best_pointplot(
     df_best,
     method_order,
@@ -260,12 +384,7 @@ def plot_best_pointplot(
     metric,
     simul_or_real,
 ):
-    figure = plt.figure(figsize=(max(10, 0.6 * len(method_order)) + 5.0, 7.0))
-    grid_spec = figure.add_gridspec(
-        ncols=2, nrows=1, width_ratios=[1.0, 0.5], wspace=0.05
-    )
-    ax = figure.add_subplot(grid_spec[0, 0])
-    ax_leg = figure.add_subplot(grid_spec[0, 1])
+    figure, ax = plt.subplots(figsize=(max(10, 0.6 * len(method_order)), 7.0))
 
     box_face = to_rgba("#DE9995", 0.18)
     box_edge = "#730800"
@@ -304,6 +423,25 @@ def plot_best_pointplot(
         zorder=6,
     )
     finalize_marker_edges(ax)
+    point_coordinates = extract_pointplot_coordinates(
+        ax,
+        method_order,
+        experiment_order,
+        experiment_palette,
+    )
+    overlay_top_experiment_shapes(
+        ax,
+        df_best,
+        point_coordinates,
+        experiment_palette,
+        top_experiment_shapes=TOP_EXPERIMENT_SHAPES,
+    )
+    annotate_top_scoring_points(
+        ax,
+        df_best,
+        point_coordinates,
+        top_point_label_count=TOP_POINT_LABEL_COUNT,
+    )
 
     ax.set_xlabel("dFC method")
     ax.set_ylabel(metric)
@@ -318,20 +456,8 @@ def plot_best_pointplot(
     if ax.legend_:
         ax.legend_.remove()
 
-    draw_labeled_legend_panel(
-        ax_leg,
-        experiment_order,
-        experiment_palette,
-        ncols=2 if simul_or_real == "real" else 1,
-        fontsize=8,
-        markersize=5,
-    )
-    ax_leg.set_title("Experiment", fontsize=9, pad=4, fontweight="bold")
-    box = ax_leg.get_position()
-    ax_leg.set_position([box.x0, box.y0 - 0.03, box.width, box.height])
-
     boldify_axes(ax, xlabel="dFC method", ylabel=metric)
-    figure.tight_layout(rect=[0.02, 0.02, 0.98, 0.98])
+    figure.tight_layout()
 
     savefig_pub(
         f"{output_root}/ML_scores_{embedding}_{metric}_{LEVEL}_{simul_or_real}_best.png"

From 0ae81a41e574a86947dde69bf5a61a4987eaf7bc Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 11 Mar 2026 15:09:19 -0400
Subject: [PATCH 357/401] relabel experiments; neutral palette and per-method labels

---
 .../helper_functions.py                       |  84 +++++------
 task_dFC/multi_dataset_analysis/ml_results.py | 136 +++++++++++++-----
 2 files changed, 144 insertions(+), 76 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index cb584c1..2db9dc9 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -61,50 +61,50 @@ def savefig_pub(path_png_or_pdf: str):
 
 DEFAULT_EXPERIMENT_NAME_MAP = {
     "real": {
-        "emotionregulation": "exp1",
-        "audsem": "exp2",
-        "visrhyme": "exp3",
-        "vissem": "exp4",
-        "visspell": "exp5",
-        "arithmetic": "exp6",
-        "stroop": "exp7",
-        "cuedts": "exp8",
-        "axcpt": "exp9",
-        "matching": "exp10",
-        "stern": "exp11",
-        "st": "exp12",
-        "vswm": "exp13",
-        "expo": "exp14",
-        "recall": "exp15",
-        "feedback": "exp16",
-        "ppalocalizer": "exp17",
-        "localiser": "exp18",
-        "localizer": "exp19",
-        "cic": "exp20",
-        "fribbids": "exp21",
-        "risk": "exp22",
-        "itc": "exp23",
-        "fearlearning": "exp24",
-        "paingen": "exp25",
-        "motor": "exp26",
-        "execution": "exp27",
-        "imagery": "exp28",
-        "ihg": "exp29",
+        "emotionregulation": "EXP.18",
+        "audsem": "EXP.3",
+        "visrhyme": "EXP.4",
+        "vissem": "EXP.5",
+        "visspell": "EXP.6",
+        "arithmetic": "EXP.24",
+        "stroop": "EXP.15",
+        "cuedts": "EXP.13",
+        "axcpt": "EXP.12",
+        "matching": "EXP.25",
+        "stern": "EXP.14",
+        "st": "EXP.29",
+        "vswm": "EXP.26",
+        "expo": "EXP.20",
+        "recall": "EXP.21",
+        "feedback": "EXP.22",
+        "ppalocalizer": "EXP.2",
+        "localiser": "EXP.27",
+        "localizer": "EXP.28",
+        "cic": "EXP.11",
+        "fribbids": "EXP.10",
+        "risk": "EXP.9",
+        "itc": "EXP.8",
+        "fearlearning": "EXP.1",
+        "paingen": "EXP.23",
+        "motor": "EXP.19",
+        "execution": "EXP.16",
+        "imagery": "EXP.17",
+        "ihg": "EXP.7",
     },
     "simulated": {
-        "lowfreqlongrest": "exp1",
-        "lowfreqshortrest": "exp2",
-        "lowfreqshorttask": "exp3",
-        "axcpt": "exp4",
-        "stern": "exp5",
-        "cuedts": "exp6",
-        "execution": "exp7",
-        "imagery": "exp8",
-        "localizer": "exp9",
-        "ppalocalizer": "exp10",
-        "itc": "exp11",
-        "stroop": "exp12",
-        "risk": "exp13",
+        "lowfreqlongrest": "EXP.1",
+        "lowfreqshortrest": "EXP.2",
+        "lowfreqshorttask": "EXP.3",
+        "axcpt": "EXP.4",
+        "stern": "EXP.5",
+        "cuedts": "EXP.6",
+        "execution": "EXP.7",
+        "imagery": "EXP.8",
+        "localizer": "EXP.9",
+        "ppalocalizer": "EXP.10",
+        "itc": "EXP.11",
+        "stroop": "EXP.12",
+        "risk": "EXP.13",
     },
 }
 
diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index cb7d5e3..5e4d229 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -36,9 +36,11 @@
     ("PCA", "SI"),
     ("PLS", "SI"),
 ]
-TOP_POINT_LABEL_COUNT = 5
 TOP_EXPERIMENT_SHAPES = 3
 TOP_EXPERIMENT_MARKERS = ["*", "^", "D", "P", "X"]
+COLOR_THRESHOLD = 60.0
+PER_METHOD_LABEL_SCORE_THRESHOLD = 55.0
+NEUTRAL_COLOR = "#555555"
 
 
 def parse_args():
@@ -252,6 +254,23 @@ def finalize_marker_edges(ax):
             pass
 
 
+def get_colored_experiment_mask(df_best, color_threshold=COLOR_THRESHOLD):
+    """Return set of experiments with max score >= color_threshold across all methods."""
+    max_scores = df_best.groupby("experiment", observed=True)["score"].max()
+    return set(max_scores[max_scores >= color_threshold].index)
+
+
+def create_neutral_palette(experiment_order, colored_experiments, vibrant_palette):
+    """Create palette: neutral for non-colored experiments, vibrant for colored ones."""
+    palette = {}
+    for exp in experiment_order:
+        if exp in colored_experiments:
+            palette[exp] = vibrant_palette[exp]
+        else:
+            palette[exp] = NEUTRAL_COLOR
+    return palette
+
+
 def _as_rgb_tuple(color_value):
     rgba = to_rgba(color_value)
     return np.array([rgba[0], rgba[1], rgba[2]], dtype=float)
@@ -310,7 +329,7 @@ def overlay_top_experiment_shapes(
     ax,
     df_best,
     point_coordinates,
-    experiment_palette,
+    shape_palette,
     top_experiment_shapes,
 ):
     if top_experiment_shapes <= 0:
@@ -337,41 +356,74 @@ def overlay_top_experiment_shapes(
             x_vals,
             y_vals,
             marker=marker,
-            s=65,
-            c=experiment_palette[experiment],
+            s=120,
+            c=shape_palette[experiment],
             edgecolors="#111111",
-            linewidths=0.8,
+            linewidths=1.0,
             zorder=8,
         )
 
 
-def annotate_top_scoring_points(ax, df_best, point_coordinates, top_point_label_count):
-    if top_point_label_count <= 0:
-        return
+def annotate_per_method_quartile(
+    ax,
+    df_best,
+    point_coordinates,
+    method_order,
+    score_threshold=PER_METHOD_LABEL_SCORE_THRESHOLD,
+):
+    """
+    For each method, annotate points in the top quartile (>75th percentile)
+    if score > threshold. Position annotation left/right based on point position.
+    """
+    xticks = ax.get_xticks()
+    xticklabels = [t.get_text() for t in ax.get_xticklabels()]
+    method_positions = {lab: xticks[i] for i, lab in enumerate(xticklabels)}
 
-    top_rows = df_best.nlargest(top_point_label_count, columns="score")
-    for _, row in top_rows.iterrows():
-        experiment = row["experiment"]
-        method = row["dFC method"]
-        if experiment not in point_coordinates:
-            continue
-        if method not in point_coordinates[experiment]:
+    for method in method_order:
+        method_df = df_best[df_best["dFC method"] == method]
+        if method_df.empty:
             continue
 
-        x_value, y_value = point_coordinates[experiment][method]
-        ax.annotate(
-            experiment,
-            xy=(x_value, y_value),
-            xytext=(0, 6),
-            textcoords="offset points",
-            ha="center",
-            va="bottom",
-            fontsize=7,
-            fontweight="bold",
-            color="#1A1A1A",
-            bbox=dict(boxstyle="round,pad=0.14", fc="white", ec="none", alpha=0.75),
-            zorder=9,
-        )
+        scores = method_df["score"].values
+        quartile_threshold = np.percentile(scores, 75)
+
+        qualify_rows = method_df[
+            (method_df["score"] > score_threshold)
+            & (method_df["score"] >= quartile_threshold)
+        ]
+
+        method_center = method_positions[method]
+
+        for _, row in qualify_rows.iterrows():
+            experiment = row["experiment"]
+            if experiment not in point_coordinates:
+                continue
+            if method not in point_coordinates[experiment]:
+                continue
+
+            x_value, y_value = point_coordinates[experiment][method]
+
+            # Position text left or right based on point position
+            if x_value < method_center:
+                ha_align = "right"
+                x_offset = -6
+            else:
+                ha_align = "left"
+                x_offset = 6
+
+            ax.annotate(
+                experiment,
+                xy=(x_value, y_value),
+                xytext=(x_offset, 4),
+                textcoords="offset points",
+                ha=ha_align,
+                va="bottom",
+                fontsize=7,
+                fontweight="bold",
+                color="#1A1A1A",
+                bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="none", alpha=0.75),
+                zorder=9,
+            )
 
 
 def plot_best_pointplot(
@@ -386,6 +438,14 @@ def plot_best_pointplot(
 ):
     figure, ax = plt.subplots(figsize=(max(10, 0.6 * len(method_order)), 7.0))
 
+    # Identify experiments with high performance (>= COLOR_THRESHOLD)
+    colored_experiments = get_colored_experiment_mask(df_best, COLOR_THRESHOLD)
+
+    # Create neutral palette: vibrant for high performers, neutral for others
+    neutral_palette = create_neutral_palette(
+        experiment_order, colored_experiments, experiment_palette
+    )
+
     box_face = to_rgba("#DE9995", 0.18)
     box_edge = "#730800"
 
@@ -407,6 +467,7 @@ def plot_best_pointplot(
     lower, upper = get_pointplot_limits(metric)
     overlay_method_means(ax, df_best, lower, upper)
 
+    # Draw pointplot with neutral palette
     sns.pointplot(
         data=df_best,
         x="dFC method",
@@ -418,29 +479,36 @@ def plot_best_pointplot(
         errorbar=None,
         linestyles="",
         markers="o",
-        palette=experiment_palette,
+        palette=neutral_palette,
         ax=ax,
         zorder=6,
     )
     finalize_marker_edges(ax)
+
+    # Extract point coordinates from the pointplot
     point_coordinates = extract_pointplot_coordinates(
         ax,
         method_order,
         experiment_order,
-        experiment_palette,
+        neutral_palette,
     )
+
+    # Overlay shapes for top 3 experiments using vibrant palette
     overlay_top_experiment_shapes(
         ax,
         df_best,
         point_coordinates,
-        experiment_palette,
+        neutral_palette,
         top_experiment_shapes=TOP_EXPERIMENT_SHAPES,
     )
-    annotate_top_scoring_points(
+
+    # Annotate per-method quartile points
+    annotate_per_method_quartile(
         ax,
         df_best,
         point_coordinates,
-        top_point_label_count=TOP_POINT_LABEL_COUNT,
+        method_order,
+        score_threshold=PER_METHOD_LABEL_SCORE_THRESHOLD,
     )
 
     ax.set_xlabel("dFC method")

From 378cfe2d93fdb2dd17e4a8ac363795ce900812f9 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 11 Mar 2026 18:03:06 -0400
Subject: [PATCH 358/401] curated experiment colors, sorted heatmap rows, star markers

---
 .../helper_functions.py                       |  41 +++++-
 task_dFC/multi_dataset_analysis/ml_results.py | 129 ++++++++++++------
 2 files changed, 121 insertions(+), 49 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 2db9dc9..5f73c0c 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -13,6 +13,24 @@
 from scipy.stats import ttest_ind
 from sklearn.neighbors import NearestNeighbors
 
+# Curated palette of maximally distinct, publication-quality colors.
+# Used for coloring high-performing experiments so each gets a clearly
+# different hue even when only a few experiments are highlighted.
+_VIBRANT_DISTINCT_COLORS = [
+    "#E6194B",  # vivid red
+    "#3CB44B",  # vivid green
+    "#4363D8",  # vivid blue
+    "#F58231",  # vivid orange
+    "#911EB4",  # vivid purple
+    "#42D4F4",  # cyan
+    "#F032E6",  # magenta
+    "#008080",  # teal
+    "#9A6324",  # brown
+    "#000075",  # navy
+    "#808000",  # olive
+    "#DC143C",  # crimson
+]
+
 ###################### Publication style ######################
 
 
@@ -37,7 +55,7 @@ def setup_pub_style():
             "grid.linewidth": 0.6,
             # Figure/layout
             "figure.dpi": 150,  # on-screen
-            "savefig.dpi": 500,  # export
+            "savefig.dpi": 1000,  # export
             "savefig.bbox": "tight",
             "savefig.pad_inches": 0.04,
             # Vector export: keep text as text in PDF/SVG
@@ -171,17 +189,23 @@ def build_experiment_display_info(tasks_iterable, task_reference_order, simul_or
         used_labels[experiment_label_key] = task
         used_labels_lower.add(experiment_label_key)
 
-    colors = sns.color_palette("husl", n_colors=max(1, len(task_order)))
+    n = max(1, len(task_order))
+    colors = [
+        _VIBRANT_DISTINCT_COLORS[i % len(_VIBRANT_DISTINCT_COLORS)] for i in range(n)
+    ]
     experiment_order = [task_to_experiment[task] for task in task_order]
-    experiment_palette = {
-        experiment_label: mcolors.to_hex(color)
-        for experiment_label, color in zip(experiment_order, colors)
-    }
+    experiment_palette = dict(zip(experiment_order, colors))
 
     return task_order, task_to_experiment, experiment_order, experiment_palette
 
 
 def relabel_heatmap_rows(matrix_df, annot_df, task_reference_order, task_to_experiment):
+    def _experiment_sort_key(exp_label):
+        match = re.match(r"(?i)^\s*exp\s*[._-]?\s*(\d+)\s*$", str(exp_label))
+        if match:
+            return (0, int(match.group(1)), str(exp_label).lower())
+        return (1, float("inf"), str(exp_label).lower())
+
     row_order = get_present_task_order(matrix_df.index.tolist(), task_reference_order)
     experiment_labels = [task_to_experiment[task] for task in row_order]
 
@@ -193,6 +217,11 @@ def relabel_heatmap_rows(matrix_df, annot_df, task_reference_order, task_to_expe
         relabeled_annot = annot_df.loc[row_order].copy()
         relabeled_annot.index = experiment_labels
 
+    sorted_labels = sorted(relabeled_matrix.index.tolist(), key=_experiment_sort_key)
+    relabeled_matrix = relabeled_matrix.loc[sorted_labels]
+    if relabeled_annot is not None:
+        relabeled_annot = relabeled_annot.loc[sorted_labels]
+
     return relabeled_matrix, relabeled_annot, row_order
 
 
diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index 5e4d229..5cf100e 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -37,10 +37,10 @@
     ("PLS", "SI"),
 ]
 TOP_EXPERIMENT_SHAPES = 3
-TOP_EXPERIMENT_MARKERS = ["*", "^", "D", "P", "X"]
+TOP_EXPERIMENT_MARKERS = ["*"]  # star for all top experiments
 COLOR_THRESHOLD = 60.0
 PER_METHOD_LABEL_SCORE_THRESHOLD = 55.0
-NEUTRAL_COLOR = "#555555"
+NEUTRAL_COLOR = "#AE7171"
 
 
 def parse_args():
@@ -206,6 +206,12 @@ def get_pointplot_limits(metric):
     return 0.5, 1.0
 
 
+def convert_threshold_to_score_scale(threshold, metric):
+    if metric != "SI" and threshold > 1.0:
+        return threshold / 100.0
+    return threshold
+
+
 def get_heatmap_limits(metric):
     if metric == "SI":
         return None, 1.0, 0.0
@@ -260,6 +266,18 @@ def get_colored_experiment_mask(df_best, color_threshold=COLOR_THRESHOLD):
     return set(max_scores[max_scores >= color_threshold].index)
 
 
+def get_top_experiments_by_mean(df_best, top_experiment_shapes=TOP_EXPERIMENT_SHAPES):
+    if top_experiment_shapes <= 0:
+        return []
+    return (
+        df_best.groupby("experiment", observed=True)["score"]
+        .mean()
+        .sort_values(ascending=False)
+        .head(top_experiment_shapes)
+        .index.tolist()
+    )
+
+
 def create_neutral_palette(experiment_order, colored_experiments, vibrant_palette):
     """Create palette: neutral for non-colored experiments, vibrant for colored ones."""
     palette = {}
@@ -277,11 +295,6 @@ def _as_rgb_tuple(color_value):
 
 
 def extract_pointplot_coordinates(ax, method_order, experiment_order, experiment_palette):
-    target_rgb = {
-        experiment: _as_rgb_tuple(color)
-        for experiment, color in experiment_palette.items()
-    }
-
     candidate_lines = []
     for line in ax.lines:
         x_data = np.asarray(line.get_xdata(), dtype=float)
@@ -293,24 +306,11 @@ def extract_pointplot_coordinates(ax, method_order, experiment_order, experiment
             continue
         candidate_lines.append(line)
 
-    line_color_distances = []
-    for line in candidate_lines:
-        line_rgb = _as_rgb_tuple(line.get_markerfacecolor())
-        for experiment in experiment_order:
-            dist = float(np.linalg.norm(line_rgb - target_rgb[experiment]))
-            line_color_distances.append((dist, id(line), line, experiment))
-
-    assigned_lines = {}
-    used_experiments = set()
-    used_lines = set()
-    for _, line_id, line, experiment in sorted(line_color_distances, key=lambda x: x[0]):
-        if experiment in used_experiments or line_id in used_lines:
-            continue
-        assigned_lines[experiment] = line
-        used_experiments.add(experiment)
-        used_lines.add(line_id)
-        if len(assigned_lines) == len(experiment_order):
-            break
+    assigned_lines = {
+        experiment: candidate_lines[idx]
+        for idx, experiment in enumerate(experiment_order)
+        if idx < len(candidate_lines)
+    }
 
     coordinates = {}
     for experiment, line in assigned_lines.items():
@@ -325,6 +325,33 @@ def extract_pointplot_coordinates(ax, method_order, experiment_order, experiment
     return coordinates
 
 
+def resize_colored_markers(
+    ax,
+    experiment_order,
+    colored_experiments,
+    method_order,
+    base_size=5,
+    colored_size=8,
+):
+    """Make circles for colored (high-performing) experiments slightly bigger."""
+    candidate_lines = []
+    for line in ax.lines:
+        x_data = np.asarray(line.get_xdata(), dtype=float)
+        y_data = np.asarray(line.get_ydata(), dtype=float)
+        marker = line.get_marker()
+        if marker in {None, "", "None", " "}:
+            continue
+        if x_data.size != len(method_order) or y_data.size != len(method_order):
+            continue
+        candidate_lines.append(line)
+
+    for idx, experiment in enumerate(experiment_order):
+        if idx >= len(candidate_lines):
+            break
+        size = colored_size if experiment in colored_experiments else base_size
+        candidate_lines[idx].set_markersize(size)
+
+
 def overlay_top_experiment_shapes(
     ax,
     df_best,
@@ -335,13 +362,7 @@ def overlay_top_experiment_shapes(
     if top_experiment_shapes <= 0:
         return
 
-    top_experiments = (
-        df_best.groupby("experiment", observed=True)["score"]
-        .mean()
-        .sort_values(ascending=False)
-        .head(top_experiment_shapes)
-        .index.tolist()
-    )
+    top_experiments = get_top_experiments_by_mean(df_best, top_experiment_shapes)
 
     for rank, experiment in enumerate(top_experiments):
         if experiment not in point_coordinates:
@@ -356,7 +377,7 @@ def overlay_top_experiment_shapes(
             x_vals,
             y_vals,
             marker=marker,
-            s=120,
+            s=250,
             c=shape_palette[experiment],
             edgecolors="#111111",
             linewidths=1.0,
@@ -369,12 +390,15 @@ def annotate_per_method_quartile(
     df_best,
     point_coordinates,
     method_order,
+    colored_experiments,
     score_threshold=PER_METHOD_LABEL_SCORE_THRESHOLD,
 ):
     """
     For each method, annotate points in the top quartile (>75th percentile)
     if score > threshold. Position annotation left/right based on point position.
     """
+    if not colored_experiments:
+        return
     xticks = ax.get_xticks()
     xticklabels = [t.get_text() for t in ax.get_xticklabels()]
     method_positions = {lab: xticks[i] for i, lab in enumerate(xticklabels)}
@@ -388,7 +412,8 @@ def annotate_per_method_quartile(
         quartile_threshold = np.percentile(scores, 75)
 
         qualify_rows = method_df[
-            (method_df["score"] > score_threshold)
+            method_df["experiment"].isin(colored_experiments)
+            & (method_df["score"] > score_threshold)
             & (method_df["score"] >= quartile_threshold)
         ]
 
@@ -406,18 +431,18 @@ def annotate_per_method_quartile(
             # Position text left or right based on point position
             if x_value < method_center:
                 ha_align = "right"
-                x_offset = -6
+                x_offset = -11
             else:
                 ha_align = "left"
-                x_offset = 6
+                x_offset = 11
 
             ax.annotate(
                 experiment,
                 xy=(x_value, y_value),
-                xytext=(x_offset, 4),
+                xytext=(x_offset, 0),
                 textcoords="offset points",
                 ha=ha_align,
-                va="bottom",
+                va="center",
                 fontsize=7,
                 fontweight="bold",
                 color="#1A1A1A",
@@ -436,10 +461,26 @@ def plot_best_pointplot(
     metric,
     simul_or_real,
 ):
-    figure, ax = plt.subplots(figsize=(max(10, 0.6 * len(method_order)), 7.0))
+    # Keep the original width scaling so method spacing is unchanged;
+    # reduce only the height to improve aspect ratio.
+    plot_width = max(11, 0.6 * len(method_order))
+    plot_height = 5.6
+    figure, ax = plt.subplots(figsize=(plot_width, plot_height))
+
+    color_threshold = convert_threshold_to_score_scale(COLOR_THRESHOLD, metric)
+    label_threshold = convert_threshold_to_score_scale(
+        PER_METHOD_LABEL_SCORE_THRESHOLD, metric
+    )
+
+    top_experiments = get_top_experiments_by_mean(df_best, TOP_EXPERIMENT_SHAPES)
 
-    # Identify experiments with high performance (>= COLOR_THRESHOLD)
-    colored_experiments = get_colored_experiment_mask(df_best, COLOR_THRESHOLD)
+    # SI policy: color/annotate only star experiments.
+    if metric == "SI":
+        colored_experiments = set(top_experiments)
+        label_threshold = -np.inf
+    else:
+        # Identify experiments with high performance (>= COLOR_THRESHOLD)
+        colored_experiments = get_colored_experiment_mask(df_best, color_threshold)
 
     # Create neutral palette: vibrant for high performers, neutral for others
     neutral_palette = create_neutral_palette(
@@ -484,6 +525,7 @@ def plot_best_pointplot(
         zorder=6,
     )
     finalize_marker_edges(ax)
+    resize_colored_markers(ax, experiment_order, colored_experiments, method_order)
 
     # Extract point coordinates from the pointplot
     point_coordinates = extract_pointplot_coordinates(
@@ -508,7 +550,8 @@ def plot_best_pointplot(
         df_best,
         point_coordinates,
         method_order,
-        score_threshold=PER_METHOD_LABEL_SCORE_THRESHOLD,
+        colored_experiments=colored_experiments,
+        score_threshold=label_threshold,
     )
 
     ax.set_xlabel("dFC method")
@@ -517,7 +560,7 @@ def plot_best_pointplot(
         ax.set_ylim(top=1.02)
     else:
         ax.set_ylim(0.48, 1.02)
-    ax.grid(True, axis="y", alpha=0.25)
+    ax.grid(True, axis="y", color="#FFFFFF", alpha=0.85, linewidth=1.1)
     sns.despine(ax=ax, top=True, right=True)
     plt.setp(ax.get_xticklabels(), rotation=35, ha="right")
 

From d0a9fc11a9dd55b823bca6c23c5f48fc0b3ca0f4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 11 Mar 2026 18:11:46 -0400
Subject: [PATCH 359/401] remove unused legend panel and RGB helper

---
 .../helper_functions.py                       | 47 -------------------
 task_dFC/multi_dataset_analysis/ml_results.py |  5 --
 2 files changed, 52 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index 5f73c0c..aa52ee1 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -1,4 +1,3 @@
-import math
 import re
 from pathlib import Path
 
@@ -239,52 +238,6 @@ def boldify_axes(ax, xlabel=None, ylabel=None, rotate_xticks=35):
         plt.setp(ax.get_xticklabels(), fontweight="bold")
 
 
-def draw_labeled_legend_panel(
-    ax_leg,
-    label_order,
-    palette,
-    ncols=2,
-    fontsize=8,
-    markersize=5,
-    colpad=0.04,
-):
-    ax_leg.set_axis_off()
-    ax_leg.set_xlim(0, 1)
-    ax_leg.set_ylim(0, 1)
-    rows = len(label_order)
-    if rows == 0:
-        return
-
-    rows_per_col = max(1, math.ceil(rows / ncols))
-    x_cols = [0.02 + i * (1.0 / ncols) for i in range(ncols)]
-    top = 0.98
-    dy = (top - 0.06) / rows_per_col
-
-    col = 0
-    row_in_col = 0
-    for label in label_order:
-        if row_in_col >= rows_per_col:
-            col += 1
-            row_in_col = 0
-        if col >= ncols:
-            break
-        x = x_cols[col]
-        y = top - row_in_col * dy
-        color = palette.get(label, "0.4")
-        ax_leg.plot(
-            [x],
-            [y],
-            marker="o",
-            ms=markersize,
-            mfc=color,
-            mec="#222222",
-            mew=0.8,
-            ls="None",
-        )
-        ax_leg.text(x + colpad, y, label, fontsize=fontsize, ha="left", va="center")
-        row_in_col += 1
-
-
 def mean_ci_boot(y, n_boot=3000, ci=95, rng=None):
     y = np.asarray(y, float)
     y = y[~np.isnan(y)]
diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index 5cf100e..55ff40f 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -289,11 +289,6 @@ def create_neutral_palette(experiment_order, colored_experiments, vibrant_palett
     return palette
 
 
-def _as_rgb_tuple(color_value):
-    rgba = to_rgba(color_value)
-    return np.array([rgba[0], rgba[1], rgba[2]], dtype=float)
-
-
 def extract_pointplot_coordinates(ax, method_order, experiment_order, experiment_palette):
     candidate_lines = []
     for line in ax.lines:

From 57f2068d7361a18e0f0497cb788cd1fe09200a5f Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 11 Mar 2026 18:18:20 -0400
Subject: [PATCH 360/401] lighten neutral color and tighten annotation offsets

---
 task_dFC/multi_dataset_analysis/ml_results.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index 55ff40f..2312743 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -40,7 +40,7 @@
 TOP_EXPERIMENT_MARKERS = ["*"]  # star for all top experiments
 COLOR_THRESHOLD = 60.0
 PER_METHOD_LABEL_SCORE_THRESHOLD = 55.0
-NEUTRAL_COLOR = "#AE7171"
+NEUTRAL_COLOR = "#D49B9B"
 
 
 def parse_args():
@@ -426,10 +426,10 @@ def annotate_per_method_quartile(
             # Position text left or right based on point position
             if x_value < method_center:
                 ha_align = "right"
-                x_offset = -11
+                x_offset = -10
             else:
                 ha_align = "left"
-                x_offset = 11
+                x_offset = 10
 
             ax.annotate(
                 experiment,

From a82cb6c46a52033d5680560f760b8856f8695327 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 11 Mar 2026 20:21:41 -0400
Subject: [PATCH 361/401] update cohensd script

---
 task_dFC/multi_dataset_analysis/cohensd.py | 318 ++++++++-------------
 1 file changed, 125 insertions(+), 193 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/cohensd.py b/task_dFC/multi_dataset_analysis/cohensd.py
index f589a2c..468bdb4 100644
--- a/task_dFC/multi_dataset_analysis/cohensd.py
+++ b/task_dFC/multi_dataset_analysis/cohensd.py
@@ -52,12 +52,21 @@
     if not os.path.exists(output_root):
         os.makedirs(output_root)
 
+    # the dictionary to build the dataframe for visualization of Cohen's d across tasks
     CohensD_across_task = {
         "task": [],
         "d_values": [],
         "dataset": [],
         "ROI": [],
     }
+    # the dictionary to be used for the correlation with ML performance
+    CohensD_ML = {
+        "task": [],
+        "run": [],
+        "dataset": [],
+        "CohensD_max": [],
+        "CohensD_mean": [],
+    }
     for dataset in DATASETS:
         print(f"Processing dataset: {dataset}")
         dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
@@ -89,95 +98,129 @@
                 print(f"Skipping task {task} as it's not in the inclusion list.")
                 continue
             d_values_all = []
-            for session in SESSIONS:
-                print(f"Processing task: {task}")
-                SUBJECTS = find_available_subjects(
-                    dFC_root=dFC_root,
-                    task=task,
-                    dFC_id=None,
-                    session=session,
-                )
+            session = SESSIONS[
+                0
+            ]  # for now, only use the first session if multiple are present
+            print(f"Processing task: {task}")
+            SUBJECTS = find_available_subjects(
+                dFC_root=dFC_root,
+                task=task,
+                dFC_id=None,
+                session=session,
+            )
+            excluded_subjects = []
+            for run in RUNS[task]:
+                d_values_run = []
                 for subj in SUBJECTS:
-                    for run in RUNS[task]:
-                        try:
-                            task_data = load_task_data(
-                                roi_root=roi_root,
-                                subj=subj,
-                                task=task,
-                                run=run,
-                                session=session,
-                            )
-                        except:
-                            continue
-
-                        if run is None:
-                            if session is None:
-                                BOLD_file_name = "{subj_id}_{task}_time-series.npy"
-                            else:
-                                BOLD_file_name = (
-                                    "{subj_id}_{session}_{task}_time-series.npy"
-                                )
-                        else:
-                            if session is None:
-                                BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy"
-                            else:
-                                BOLD_file_name = (
-                                    "{subj_id}_{session}_{task}_{run}_time-series.npy"
-                                )
-                        try:
-                            BOLD = data_loader.load_TS(
-                                data_root=roi_root,
-                                file_name=BOLD_file_name,
-                                subj_id2load=subj,
-                                task=task,
-                                session=session,
-                                run=run,
-                            )
-                        except Exception as e:
-                            print(f"Error loading BOLD data: {e}")
-                            continue
-                        BOLD_data = BOLD.data  # np.ndarray (n_ROIs, n_TRs)
-
-                        Fs_task = task_data["Fs_task"]
-                        TR_task = 1 / Fs_task
-
-                        TR_array = np.arange(0, BOLD_data.shape[1])
-                        task_presence, indices = extract_task_presence(
-                            event_labels=task_data["event_labels"],
-                            TR_task=TR_task,
-                            TR_mri=task_data["TR_mri"],
-                            binary=True,
-                            binarizing_method="GMM",
-                            no_hrf=False,
-                            TR_array=TR_array,
+                    try:
+                        task_data = load_task_data(
+                            roi_root=roi_root,
+                            subj=subj,
+                            task=task,
+                            run=run,
+                            session=session,
                         )
+                    except:
+                        excluded_subjects.append(subj)
+                        continue
 
-                        # if n_TRs do not match, align them
-                        if BOLD_data.shape[1] != task_presence.shape[0]:
-                            print(
-                                f"Before alignment, shape of task_presence: {task_presence.shape}, shape of BOLD_data: {BOLD_data.shape}"
-                            )
-                            min_TRs = min(BOLD_data.shape[1], task_presence.shape[0])
-                            task_presence = task_presence[:min_TRs]
-                            BOLD_data = BOLD_data[:, :min_TRs]
-                            print(
-                                f"After alignment, shape of task_presence: {task_presence.shape}, shape of BOLD_data: {BOLD_data.shape}"
+                    if run is None:
+                        if session is None:
+                            BOLD_file_name = "{subj_id}_{task}_time-series.npy"
+                        else:
+                            BOLD_file_name = "{subj_id}_{session}_{task}_time-series.npy"
+                    else:
+                        if session is None:
+                            BOLD_file_name = "{subj_id}_{task}_{run}_time-series.npy"
+                        else:
+                            BOLD_file_name = (
+                                "{subj_id}_{session}_{task}_{run}_time-series.npy"
                             )
-                            # also adjust indices
-                            indices = [i for i in indices if i < min_TRs]
-                        task_presence = task_presence[indices]  # (n_TRs,)
-                        BOLD_data = BOLD_data[:, indices]  # (n_ROIs, n_TRs)
-
-                        assert BOLD_data.shape[1] == task_presence.shape[0]
+                    try:
+                        BOLD = data_loader.load_TS(
+                            data_root=roi_root,
+                            file_name=BOLD_file_name,
+                            subj_id2load=subj,
+                            task=task,
+                            session=session,
+                            run=run,
+                        )
+                    except Exception as e:
+                        print(f"Error loading BOLD data: {e}")
+                        excluded_subjects.append(subj)
+                        continue
+                    BOLD_data = BOLD.data  # np.ndarray (n_ROIs, n_TRs)
+
+                    Fs_task = task_data["Fs_task"]
+                    TR_task = 1 / Fs_task
+
+                    TR_array = np.arange(0, BOLD_data.shape[1])
+                    task_presence, indices = extract_task_presence(
+                        event_labels=task_data["event_labels"],
+                        TR_task=TR_task,
+                        TR_mri=task_data["TR_mri"],
+                        binary=True,
+                        binarizing_method="GMM",
+                        no_hrf=False,
+                        TR_array=TR_array,
+                    )
+
+                    # if n_TRs do not match, align them
+                    if BOLD_data.shape[1] != task_presence.shape[0]:
+                        print(
+                            f"Before alignment, shape of task_presence: {task_presence.shape}, shape of BOLD_data: {BOLD_data.shape}"
+                        )
+                        min_TRs = min(BOLD_data.shape[1], task_presence.shape[0])
+                        task_presence = task_presence[:min_TRs]
+                        BOLD_data = BOLD_data[:, :min_TRs]
+                        print(
+                            f"After alignment, shape of task_presence: {task_presence.shape}, shape of BOLD_data: {BOLD_data.shape}"
+                        )
+                        # also adjust indices
+                        indices = [i for i in indices if i < min_TRs]
+                    task_presence = task_presence[indices]  # (n_TRs,)
+                    BOLD_data = BOLD_data[:, indices]  # (n_ROIs, n_TRs)
+
+                    assert BOLD_data.shape[1] == task_presence.shape[0]
+
+                    cohen_d = cohen_d_bold(X=BOLD_data.T, y=task_presence)  # (n_ROIs,)
+                    d_values_run.append(cohen_d)
+
+                d_values_run = np.array(d_values_run)  # (n_subjects, n_ROIs)
+                assert (
+                    d_values_run.shape[1] == BOLD_data.shape[0]
+                ), f"Expected number of ROIs in d_values_run ({d_values_run.shape[1]}) to match BOLD_data ({BOLD_data.shape[0]})"
+                assert d_values_run.shape[0] == len(SUBJECTS) - len(
+                    set(excluded_subjects)
+                ), f"Expected number of subjects in d_values_run ({d_values_run.shape[0]}) to match n_subjects ({len(SUBJECTS) - len(set(excluded_subjects))})"
+
+                CohensD_ML["task"].append(task)
+                CohensD_ML["run"].append(run)
+                CohensD_ML["dataset"].append(dataset)
+                # MAX |d| across ROIs for this run after averaging across subjects
+                CohensD_ML["CohensD_max"].append(
+                    np.nanmax(np.abs(np.nanmean(d_values_run, axis=0)))
+                )
+                # MEAN |d| across ROIs for this run after averaging across subjects
+                CohensD_ML["CohensD_mean"].append(
+                    np.nanmean(np.abs(np.nanmean(d_values_run, axis=0)))
+                )
 
-                        cohen_d = cohen_d_bold(X=BOLD_data.T, y=task_presence)
-                        d_values_all.append(cohen_d)
+                d_values_all.append(d_values_run)
 
             if len(d_values_all) == 0:
                 print(f"No data found for task {task} in dataset {dataset}. Skipping.")
                 continue
-            d_values_all = np.array(d_values_all)  # (n_subjectsxrunsxsessions, n_ROIs)
-            avg_d_values = np.nanmean(d_values_all, axis=0)  # (n_ROIs,)
+            d_values_all = np.array(d_values_all)  # (runs, n_subjects, n_ROIs)
+            assert d_values_all.shape[1] == len(SUBJECTS) - len(
+                set(excluded_subjects)
+            ), f"Expected number of subjects in d_values_all ({d_values_all.shape[1]}) to match n_subjects ({len(SUBJECTS) - len(set(excluded_subjects))})"
+            assert d_values_all.shape[0] == len(
+                RUNS[task]
+            ), f"Expected number of runs in d_values_all ({d_values_all.shape[0]}) to match RUNS for task {task} ({len(RUNS[task])})"
+            avg_d_values = np.nanmean(
+                np.nanmean(d_values_all, axis=0), axis=0
+            )  # (n_ROIs,)
             CohensD_across_task["d_values"].extend(avg_d_values)
             CohensD_across_task["task"].extend([task] * len(avg_d_values))
             CohensD_across_task["dataset"].extend([dataset] * len(avg_d_values))
@@ -270,119 +313,8 @@
 
                 plt.close()
 
-    # --- Across-task correlation with ML performance (ABSOLUTE Cohen's d) ---
-    # Load ALL_ML_SCORES
-    ALL_ML_SCORES = np.load(
-        f"{multi_dataset_info['output_root']}/ML_results/ALL_ML_SCORES_{simul_or_real}.npy",
-        allow_pickle=True,
-    ).item()
-
-    embedding = "LE"
-    metric = "SVM balanced accuracy"
-    GROUP = "test"
-
-    # Build dataframe if not already done
-    DF = pd.DataFrame.from_dict(CohensD_across_task)
-
-    # Use absolute Cohen's d
-    DF["abs_d"] = DF["d_values"].abs()
-
-    # Choose an order (sort tasks by their MAX |d| to align with Fig. 2)
-    max_abs_per_task = (
-        DF.groupby("task")["abs_d"]
-        .max()
-        .sort_values(ascending=False)
-        .reset_index(name="abs_max")
-    )
-
-    df = pd.DataFrame.from_dict(ALL_ML_SCORES)
-    df = df[df["task"].isin(TASKS_to_include)]
-    df = df[(df["embedding"] == embedding) & (df["group"] == GROUP)]
-
-    # alphabetical method order
-    method_order = sorted(df["dFC method"].unique(), key=lambda s: s.lower())
-    df["dFC method"] = pd.Categorical(
-        df["dFC method"], categories=method_order, ordered=True
-    )
-
-    # ===== build BEST and ACROSS tables =====
-    counts_task = df.groupby("task")["run"].nunique()
-    multi_tasks = counts_task[counts_task > 1].index
-    df_multi = df[
-        df["task"].isin(multi_tasks)
-    ]  # <- use this dataframe for ACROSS figures
-
-    # BEST: one row per (task, method) with the winning run kept
-    df_best = (
-        df.sort_values(["task", "dFC method", metric], ascending=[True, True, False])
-        .drop_duplicates(subset=["task", "dFC method"], keep="first")
-        .rename(columns={metric: "score"})
-    )
-
-    # keep only the task and score columns
-    df_best = df_best[["task", "score"]]
-
-    # average over dFC methods and make a new dataframe
-    df_best = df_best.groupby("task").agg({"score": "mean"}).reset_index()
-    # find the correlation between max_abs_per_task["abs_max"] and df_best['score']
-    merged = pd.merge(max_abs_per_task, df_best, on="task")
-
-    # task="task-ppalocalizer" is an outlier, show it as a different color and exclude it from the correlation calculation
-    outlier = merged[merged["task"] == "task-ppalocalizer"]
-    merged = merged[merged["task"] != "task-ppalocalizer"]
-    plt.style.use("seaborn-v0_8-paper")
-    sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.2})
-    sns.set_style("darkgrid")
-    plt.figure(figsize=(10, 8))
-    sns.scatterplot(
-        x="abs_max", y="score", data=merged, s=60, edgecolor="k", label="Task Paradigms"
-    )
-    sns.scatterplot(
-        x="abs_max",
-        y="score",
-        data=outlier,
-        color="orange",
-        s=80,
-        edgecolor="k",
-        label="Outlier: task-ppalocalizer",
-    )
-
-    # fit and plot regression line
-    sns.regplot(
-        x="abs_max",
-        y="score",
-        data=merged,
-        scatter=False,
-        color="red",
-        line_kws={"label": "Best fit"},
-    )
-
-    plt.xlabel("Max |Cohen's d| per Task", fontweight="bold", fontsize=14)
-    plt.ylabel("SVM Balanced Accuracy", fontweight="bold", fontsize=14)
-    # plt.legend(fontsize=12)
-    correlation = merged["abs_max"].corr(merged["score"])
-    plt.text(
-        0.05,
-        0.95,
-        f"correlation  r = {correlation:.2f}",
-        transform=plt.gca().transAxes,
-        fontsize=17,
-        fontweight="bold",
-        verticalalignment="top",
-    )
-
-    plt.xticks(fontweight="bold", fontsize=12)
-    plt.yticks(fontweight="bold", fontsize=12)
-    plt.grid(True)
-    plt.tight_layout()
-    plt.savefig(
-        f"{output_root}/CohensdCorr.png",
-        dpi=150,
-        bbox_inches="tight",
-        pad_inches=0.2,
-        format="png",
-    )
-    plt.close()
+    # Save the Cohen's d values for comparison with ML performance
+    np.save(f"{output_root}/CohensD_ML_{simul_or_real}.npy", CohensD_ML)
 
     # --- Across-task visualizations (ABSOLUTE Cohen's d) ---
     sns.set_context("paper", font_scale=1.0, rc={"lines.linewidth": 1.2})

From ccb695dbec53bef2afef77b82b56fa64b1653784 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 11 Mar 2026 20:59:42 -0400
Subject: [PATCH 362/401] bug fix

---
 task_dFC/multi_dataset_analysis/cohensd.py | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/cohensd.py b/task_dFC/multi_dataset_analysis/cohensd.py
index 468bdb4..526054f 100644
--- a/task_dFC/multi_dataset_analysis/cohensd.py
+++ b/task_dFC/multi_dataset_analysis/cohensd.py
@@ -185,6 +185,7 @@
 
                     cohen_d = cohen_d_bold(X=BOLD_data.T, y=task_presence)  # (n_ROIs,)
                     d_values_run.append(cohen_d)
+                    d_values_all.append(cohen_d)
 
                 d_values_run = np.array(d_values_run)  # (n_subjects, n_ROIs)
                 assert (
@@ -206,21 +207,12 @@
                     np.nanmean(np.abs(np.nanmean(d_values_run, axis=0)))
                 )
 
-                d_values_all.append(d_values_run)
-
             if len(d_values_all) == 0:
                 print(f"No data found for task {task} in dataset {dataset}. Skipping.")
                 continue
-            d_values_all = np.array(d_values_all)  # (runs, n_subjects, n_ROIs)
-            assert d_values_all.shape[1] == len(SUBJECTS) - len(
-                set(excluded_subjects)
-            ), f"Expected number of subjects in d_values_all ({d_values_all.shape[1]}) to match n_subjects ({len(SUBJECTS) - len(set(excluded_subjects))})"
-            assert d_values_all.shape[0] == len(
-                RUNS[task]
-            ), f"Expected number of runs in d_values_all ({d_values_all.shape[0]}) to match RUNS for task {task} ({len(RUNS[task])})"
-            avg_d_values = np.nanmean(
-                np.nanmean(d_values_all, axis=0), axis=0
-            )  # (n_ROIs,)
+            d_values_all = np.array(d_values_all)  # (runs x n_subjects, n_ROIs)
+
+            avg_d_values = np.nanmean(d_values_all, axis=0)  # (n_ROIs,)
             CohensD_across_task["d_values"].extend(avg_d_values)
             CohensD_across_task["task"].extend([task] * len(avg_d_values))
             CohensD_across_task["dataset"].extend([dataset] * len(avg_d_values))

From 5ae1436c2fa7d15c7b7e261bd070f295901eff3c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 12 Mar 2026 10:55:13 -0400
Subject: [PATCH 363/401] update task_timing_stats

---
 .../task_timing_stats.py                      | 75 +++++++++++++------
 1 file changed, 53 insertions(+), 22 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/task_timing_stats.py b/task_dFC/multi_dataset_analysis/task_timing_stats.py
index 9158546..4082176 100644
--- a/task_dFC/multi_dataset_analysis/task_timing_stats.py
+++ b/task_dFC/multi_dataset_analysis/task_timing_stats.py
@@ -76,6 +76,17 @@
     transition_freq_all = {}
     rest_durations_all = {}
     task_durations_all = {}
+    DATA = {
+        "task": [],
+        "run": [],
+        "dataset": [],
+        "task_ratio_avg": [],
+        "transition_freq_avg": [],
+        "rest_durations_median": [],
+        "task_durations_median": [],
+        "rest_durations_iqr": [],
+        "task_durations_iqr": [],
+    }
     for dataset in DATASETS:
 
         print(f"Processing dataset: {dataset}")
@@ -103,11 +114,17 @@
         if RUNS is None:
             RUNS = {task: [None] for task in TASKS}
 
-        for session in SESSIONS:
+        for session in SESSIONS[:1]:  # process only the first session if multiple exist
             for task_id, task in enumerate(TASKS):
                 if not task in TASKS_to_include:
                     continue
                 for run in RUNS[task]:
+
+                    task_ratio_run = []
+                    transition_freq_run = []
+                    rest_durations_run = []
+                    task_durations_run = []
+
                     SUBJECTS = find_subj_list(roi_root)
                     # print(f"Number of subjects: {len(SUBJECTS)}")
 
@@ -146,27 +163,41 @@
                             event_labels, TR_mri=1 / task_data["Fs_task"]
                         )
 
-                        if not task in task_ratio_all:
-                            task_ratio_all[task] = []
-                        if not task in transition_freq_all:
-                            transition_freq_all[task] = []
-                        if not task in rest_durations_all:
-                            rest_durations_all[task] = []
-                        if not task in task_durations_all:
-                            task_durations_all[task] = []
-                        task_ratio_all[task].append(relative_task_on)
-                        transition_freq_all[task].append(relative_transition_freq)
-                        # rest_durations and task_durations are lists
-                        rest_durations_all[task].extend(rest_durations)
-                        task_durations_all[task].extend(task_durations)
-
-    DATA = {
-        "task_ratio_all": task_ratio_all,
-        "transition_freq_all": transition_freq_all,
-        "rest_durations_all": rest_durations_all,
-        "task_durations_all": task_durations_all,
-    }
-    # np.save(f"task_timing_stats_{simul_or_real}.npy", DATA)
+                        task_ratio_run.append(relative_task_on)
+                        transition_freq_run.append(relative_transition_freq)
+                        rest_durations_run.extend(rest_durations)
+                        task_durations_run.extend(task_durations)
+
+                    # Aggregate stats across runs for this task and store in the all-run dictionaries for later plotting
+                    if not task in task_ratio_all:
+                        task_ratio_all[task] = []
+                    if not task in transition_freq_all:
+                        transition_freq_all[task] = []
+                    if not task in rest_durations_all:
+                        rest_durations_all[task] = []
+                    if not task in task_durations_all:
+                        task_durations_all[task] = []
+                    task_ratio_all[task].extend(task_ratio_run)
+                    transition_freq_all[task].extend(transition_freq_run)
+                    rest_durations_all[task].extend(rest_durations_run)
+                    task_durations_all[task].extend(task_durations_run)
+
+                    # Aggregate run-level stats for this task and store in DATA for potential further analysis
+                    DATA["task"].append(task)
+                    DATA["run"].append(run)
+                    DATA["dataset"].append(dataset)
+                    DATA["task_ratio_avg"].append(np.nanmean(task_ratio_run))
+                    DATA["transition_freq_avg"].append(np.nanmean(transition_freq_run))
+                    DATA["rest_durations_median"].append(np.nanmedian(rest_durations_run))
+                    DATA["task_durations_median"].append(np.nanmedian(task_durations_run))
+                    q75_rest, q25_rest = np.percentile(rest_durations_run, [75, 25])
+                    iqr_rest = q75_rest - q25_rest
+                    q75_task, q25_task = np.percentile(task_durations_run, [75, 25])
+                    iqr_task = q75_task - q25_task
+                    DATA["rest_durations_iqr"].append(iqr_rest)
+                    DATA["task_durations_iqr"].append(iqr_task)
+
+    np.save(f"{output_root}/task_timing_stats_{simul_or_real}.npy", DATA)
 
     # =========================
     # Paper-quality seaborn plots (patched)

From a0cff35e2de1696c1ec66e3729d6824442023341 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 13 Mar 2026 19:47:46 -0400
Subject: [PATCH 364/401] add performance_factor.py

---
 .../helper_functions.py                       |  79 ++++
 .../performance_factor.py                     | 337 ++++++++++++++++++
 2 files changed, 416 insertions(+)
 create mode 100644 task_dFC/multi_dataset_analysis/performance_factor.py

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index aa52ee1..c5e3f94 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -73,6 +73,85 @@ def savefig_pub(path_png_or_pdf: str):
     #     plt.savefig(p.with_suffix(".pdf"))
 
 
+###################### RDoC ######################
+
+RDoC_MAP = {  # per data type: domain display order and task-code -> domain lookup
+    "real": {
+        # --- Cognitive-Atlas–aligned domains (order on paper) ---
+        "DOMAIN_ORDER": [
+            "Arousal & Regulatory Systems",
+            "Cognitive Systems",
+            "Negative Valence System",
+            "Positive Valence System",
+            "Sensorimotor Systems",
+        ],
+        # --- Map canonical task codes -> domain ---
+        "TASK2DOMAIN": {
+            # Arousal & Regulatory Systems
+            "emotionregulation": "Arousal & Regulatory Systems",
+            # Cognitive Systems
+            "audsem": "Cognitive Systems",
+            "visrhyme": "Cognitive Systems",
+            "vissem": "Cognitive Systems",
+            "visspell": "Cognitive Systems",
+            "arithmetic": "Cognitive Systems",
+            "stroop": "Cognitive Systems",
+            "cuedts": "Cognitive Systems",
+            "axcpt": "Cognitive Systems",
+            "matching": "Cognitive Systems",
+            "stern": "Cognitive Systems",
+            "st": "Cognitive Systems",
+            "vswm": "Cognitive Systems",
+            "expo": "Cognitive Systems",
+            "recall": "Cognitive Systems",
+            "feedback": "Cognitive Systems",
+            "ppalocalizer": "Cognitive Systems",
+            "localiser": "Cognitive Systems",
+            "localizer": "Cognitive Systems",
+            # Positive Valence System
+            "fribbids": "Positive Valence System",
+            "risk": "Positive Valence System",
+            "itc": "Positive Valence System",
+            # Negative Valence System
+            "fearlearning": "Negative Valence System",
+            "paingen": "Negative Valence System",
+            # Sensorimotor
+            "motor": "Sensorimotor Systems",
+            "execution": "Sensorimotor Systems",
+            "imagery": "Sensorimotor Systems",
+            "ihg": "Sensorimotor Systems",
+        },
+    },
+    "simulated": {
+        # --- Categories of simulated task paradigms ---
+        "DOMAIN_ORDER": [
+            "Simulated Periodic",
+            "Strong Performance on Real Data",
+            "Weak Performance on Real Data",
+        ],
+        # --- Map task codes -> category ---
+        "TASK2DOMAIN": {
+            # Simulated Periodic
+            "lowfreqlongrest": "Simulated Periodic",
+            "lowfreqshortrest": "Simulated Periodic",
+            "lowfreqshorttask": "Simulated Periodic",
+            # Optimal Paradigm Design, Strong Performance on Real Data
+            "axcpt": "Strong Performance on Real Data",
+            "stern": "Strong Performance on Real Data",
+            "cuedts": "Strong Performance on Real Data",
+            # Optimal Paradigm Design, Weak Performance on Real Data
+            "execution": "Weak Performance on Real Data",
+            "imagery": "Weak Performance on Real Data",
+            "localizer": "Weak Performance on Real Data",
+            "ppalocalizer": "Weak Performance on Real Data",
+            # Sub-Optimal Paradigm Design, Weak Performance on Real Data
+            "itc": "Weak Performance on Real Data",
+            "stroop": "Weak Performance on Real Data",
+            "risk": "Weak Performance on Real Data",
+        },
+    },
+}
+
 ###################### ml_results ######################
 
 
diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
new file mode 100644
index 0000000..206dfc9
--- /dev/null
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -0,0 +1,337 @@
+import argparse
+import json
+import os
+import sys
+
+import numpy as np
+import pandas as pd
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from helper_functions import RDoC_MAP, canon_task  # pyright: ignore[reportMissingImports]
+
+LEVEL = "group_lvl"
+GROUP = "test"
+
+CLASSIFIER_METRIC_MAP = {
+    "Logistic regression": "Logistic regression balanced accuracy",
+    "SVM": "SVM balanced accuracy",
+}
+
+TIMING_FEATURES = [
+    "task_ratio_avg",
+    "transition_freq_avg",
+    "rest_durations_median",
+    "task_durations_median",
+    "rest_durations_iqr",
+    "task_durations_iqr",
+]
+
+COHEN_FEATURES = [
+    "CohensD_max",
+    "CohensD_mean",
+]
+
+
+def parse_args():
+    helptext = """
+    Build a unified run-level dataframe linking ML performance to task factors.
+    """
+    parser = argparse.ArgumentParser(description=helptext)
+    parser.add_argument(
+        "--multi_dataset_info",
+        type=str,
+        required=True,
+        help="path to multi-dataset info file",
+    )
+    parser.add_argument(
+        "--simul_or_real",
+        type=str,
+        required=True,
+        choices=["simulated", "real"],
+        help="Specify 'simulated' or 'real' data",
+    )
+    return parser.parse_args()
+
+
+def read_json(json_file):
+    with open(json_file, "r") as file_obj:
+        return json.load(file_obj)
+
+
+def load_npy_dict(path, label):
+    assert os.path.exists(path), f"{label} file does not exist: {path}"
+    loaded = np.load(path, allow_pickle=True)
+    if isinstance(loaded, np.ndarray):
+        loaded = loaded.item()
+    assert isinstance(loaded, dict), f"{label} must be a dictionary. Got {type(loaded)}"
+    return loaded
+
+
+def assert_required_keys(data_dict, required_keys, label):
+    missing = [key for key in required_keys if key not in data_dict]
+    assert not missing, f"Missing required keys in {label}: {missing}"
+
+
+def dict_to_df(data_dict, label):
+    lengths = {key: len(value) for key, value in data_dict.items()}
+    unique_lengths = set(lengths.values())
+    assert len(unique_lengths) == 1, (
+        f"Inconsistent column lengths in {label}: {lengths}. "
+        "All arrays/lists must have equal length."
+    )
+    return pd.DataFrame.from_dict(data_dict)
+
+
+def normalize_run(value):
+    if value is None:
+        return "none"
+    if isinstance(value, float) and np.isnan(value):
+        return "none"
+    return str(value).strip().lower()
+
+
+def add_join_keys(df):
+    assert "task" in df.columns, "Expected column 'task'"
+    assert "run" in df.columns, "Expected column 'run'"
+    df = df.copy()
+    df["task_key"] = df["task"].astype(str).map(canon_task)
+    df["run_key"] = df["run"].map(normalize_run)
+    assert (df["task_key"].str.len() > 0).all(), "Found empty normalized task key"
+    return df
+
+
+def get_paths(multi_dataset_info, simul_or_real):
+    output_root = multi_dataset_info["output_root"]
+    return {
+        "ml": f"{output_root}/ML_results/ALL_ML_SCORES_{simul_or_real}.npy",
+        "timing": f"{output_root}/task_timing_stats/{simul_or_real}/task_timing_stats_{simul_or_real}.npy",
+        "cohensd": f"{output_root}/CohensD/{simul_or_real}/CohensD_ML_{simul_or_real}.npy",
+        "out_dir": f"{output_root}/performance_factor/{simul_or_real}",
+    }
+
+
+def prepare_ml_df(ml_scores_all):
+    assert LEVEL in ml_scores_all, f"Expected top-level key '{LEVEL}' in ML scores"
+    ml_scores = ml_scores_all[LEVEL]
+
+    required_keys = [
+        "task",
+        "run",
+        "embedding",
+        "dFC method",
+        "group",
+        *CLASSIFIER_METRIC_MAP.values(),
+    ]
+    assert_required_keys(ml_scores, required_keys, "ALL_ML_SCORES")
+
+    df_ml_wide = dict_to_df(ml_scores, "ALL_ML_SCORES[group_lvl]")
+    df_ml_wide = df_ml_wide[df_ml_wide["group"] == GROUP].copy()
+    assert not df_ml_wide.empty, f"No ML rows found for group='{GROUP}'"
+
+    if "dataset" in df_ml_wide.columns:
+        id_cols = ["dataset", "task", "run", "embedding", "dFC method", "group"]
+    else:
+        id_cols = ["task", "run", "embedding", "dFC method", "group"]
+
+    classifier_frames = []
+    for classifier, metric_key in CLASSIFIER_METRIC_MAP.items():
+        frame = df_ml_wide[id_cols + [metric_key]].copy()
+        frame["classifier model"] = classifier
+        frame = frame.rename(columns={metric_key: "classification_balanced_accuracy"})
+        classifier_frames.append(frame)
+
+    df_ml = pd.concat(classifier_frames, ignore_index=True)
+    df_ml = df_ml.rename(columns={"dFC method": "dFC assessment method"})
+
+    score = df_ml["classification_balanced_accuracy"].astype(float)
+    assert np.isfinite(score).all(), "ML performance contains NaN/Inf values"
+    assert ((score >= 0.0) & (score <= 1.0)).all(), (
+        "Expected balanced accuracy in [0, 1]. "
+        f"Observed min={score.min()}, max={score.max()}"
+    )
+
+    return add_join_keys(df_ml)
+
+
+def prepare_timing_df(timing_dict):
+    required_keys = ["task", "run", *TIMING_FEATURES]
+    assert_required_keys(timing_dict, required_keys, "task_timing_stats")
+    df_timing = dict_to_df(timing_dict, "task_timing_stats")
+
+    keep_cols = ["task", "run", *TIMING_FEATURES]
+    if "dataset" in df_timing.columns:
+        keep_cols = ["dataset", *keep_cols]
+    df_timing = df_timing[keep_cols].copy()
+
+    for col in TIMING_FEATURES:
+        values = df_timing[col].astype(float)
+        assert np.isfinite(values).all(), f"Timing feature '{col}' contains NaN/Inf"
+
+    return add_join_keys(df_timing)
+
+
+def prepare_cohensd_df(cohensd_dict):
+    required_keys = ["task", "run", *COHEN_FEATURES]
+    assert_required_keys(cohensd_dict, required_keys, "CohensD_ML")
+    df_cohensd = dict_to_df(cohensd_dict, "CohensD_ML")
+
+    keep_cols = ["task", "run", *COHEN_FEATURES]
+    if "dataset" in df_cohensd.columns:
+        keep_cols = ["dataset", *keep_cols]
+    df_cohensd = df_cohensd[keep_cols].copy()
+
+    for col in COHEN_FEATURES:
+        values = df_cohensd[col].astype(float)
+        assert np.isfinite(values).all(), f"Cohen's D feature '{col}' contains NaN/Inf"
+
+    return add_join_keys(df_cohensd)
+
+
+def choose_join_keys(df_ml, df_timing, df_cohensd):
+    has_dataset_everywhere = all(
+        "dataset" in df.columns for df in [df_ml, df_timing, df_cohensd]
+    )
+
+    base_keys = ["task_key", "run_key"]
+    dataset_keys = ["dataset", *base_keys]
+
+    timing_dupes_base = df_timing.duplicated(subset=base_keys).sum()
+    cohensd_dupes_base = df_cohensd.duplicated(subset=base_keys).sum()
+
+    if timing_dupes_base == 0 and cohensd_dupes_base == 0:
+        return base_keys
+
+    if has_dataset_everywhere:
+        timing_dupes_dataset = df_timing.duplicated(subset=dataset_keys).sum()
+        cohensd_dupes_dataset = df_cohensd.duplicated(subset=dataset_keys).sum()
+        assert timing_dupes_dataset == 0, (
+            "task_timing_stats still has duplicate rows per dataset/task/run after "
+            f"normalization. duplicate_count={timing_dupes_dataset}"
+        )
+        assert cohensd_dupes_dataset == 0, (
+            "CohensD_ML still has duplicate rows per dataset/task/run after "
+            f"normalization. duplicate_count={cohensd_dupes_dataset}"
+        )
+        return dataset_keys
+
+    raise AssertionError(
+        "Ambiguous join on task/run (duplicates found), and dataset is not available "
+        "in all sources to disambiguate."
+    )
+
+
+def merge_with_checks(df_ml, df_timing, df_cohensd, join_keys):
+    timing_cols = join_keys + TIMING_FEATURES
+    cohensd_cols = join_keys + COHEN_FEATURES
+
+    df_merged = df_ml.merge(
+        df_timing[timing_cols],
+        on=join_keys,
+        how="left",
+        validate="many_to_one",
+        indicator="timing_merge",
+    )
+    timing_unmatched = (df_merged["timing_merge"] != "both").sum()
+    assert (
+        timing_unmatched == 0
+    ), f"Could not match timing stats for {timing_unmatched} ML rows using keys {join_keys}"
+    df_merged = df_merged.drop(columns=["timing_merge"])
+
+    df_merged = df_merged.merge(
+        df_cohensd[cohensd_cols],
+        on=join_keys,
+        how="left",
+        validate="many_to_one",
+        indicator="cohensd_merge",
+    )
+    cohensd_unmatched = (df_merged["cohensd_merge"] != "both").sum()
+    assert (
+        cohensd_unmatched == 0
+    ), f"Could not match Cohen's D stats for {cohensd_unmatched} ML rows using keys {join_keys}"
+    df_merged = df_merged.drop(columns=["cohensd_merge"])
+
+    return df_merged
+
+
+def add_rdoc(df, simul_or_real):
+    task_to_domain = RDoC_MAP[simul_or_real]["TASK2DOMAIN"]
+    df = df.copy()
+    df["RDoC"] = df["task_key"].map(task_to_domain)
+
+    missing_mask = df["RDoC"].isna()
+    if missing_mask.any():
+        missing_tasks = sorted(df.loc[missing_mask, "task"].astype(str).unique())
+        raise AssertionError(
+            "Missing RDoC mapping for tasks (after canonicalization): "
+            f"{missing_tasks}. Update helper_functions.RDoC_MAP if needed."
+        )
+
+    return df
+
+
+def finalize_columns(df):
+    cols = []
+    if "dataset" in df.columns:
+        cols.append("dataset")
+    cols += [
+        "task",
+        "run",
+        "RDoC",
+        *TIMING_FEATURES,
+        *COHEN_FEATURES,
+        "dFC assessment method",
+        "classifier model",
+        "embedding",
+        "classification_balanced_accuracy",
+    ]
+
+    missing = [col for col in cols if col not in df.columns]
+    assert not missing, f"Missing expected final columns: {missing}"
+
+    out = df[cols].copy()
+    sort_cols = ["task", "run", "dFC assessment method", "classifier model", "embedding"]
+    if "dataset" in out.columns:
+        sort_cols = ["dataset", *sort_cols]
+    out = out.sort_values(sort_cols).reset_index(drop=True)
+    return out
+
+
+def save_outputs(df, out_dir, simul_or_real):
+    os.makedirs(out_dir, exist_ok=True)
+    csv_path = f"{out_dir}/performance_factor_{simul_or_real}.csv"
+    pkl_path = f"{out_dir}/performance_factor_{simul_or_real}.pkl"
+    df.to_csv(csv_path, index=False)
+    df.to_pickle(pkl_path)
+    return csv_path, pkl_path
+
+
+def main():
+    args = parse_args()
+
+    multi_dataset_info = read_json(args.multi_dataset_info)
+    paths = get_paths(multi_dataset_info, args.simul_or_real)
+
+    ml_scores_all = load_npy_dict(paths["ml"], "ALL_ML_SCORES")
+    timing_dict = load_npy_dict(paths["timing"], "task_timing_stats")
+    cohensd_dict = load_npy_dict(paths["cohensd"], "CohensD_ML")
+
+    df_ml = prepare_ml_df(ml_scores_all)
+    df_timing = prepare_timing_df(timing_dict)
+    df_cohensd = prepare_cohensd_df(cohensd_dict)
+
+    join_keys = choose_join_keys(df_ml, df_timing, df_cohensd)
+    print(f"Using join keys: {join_keys}")
+
+    df = merge_with_checks(df_ml, df_timing, df_cohensd, join_keys)
+    df = add_rdoc(df, args.simul_or_real)
+    df = finalize_columns(df)
+
+    csv_path, pkl_path = save_outputs(df, paths["out_dir"], args.simul_or_real)
+
+    print(f"Saved dataframe with shape: {df.shape}")
+    print(f"CSV: {csv_path}")
+    print(f"PKL: {pkl_path}")
+
+
+if __name__ == "__main__":
+    main()

From b7de7a055c2aa64004fddc454556ec53aec2851b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 13 Mar 2026 20:05:12 -0400
Subject: [PATCH 365/401] minor

---
 task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
index 4a11075..e4d975e 100644
--- a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
+++ b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
@@ -30,7 +30,7 @@ if [ ! -f "$SCRIPT_PATH" ]; then
 fi
 
 case "$SCRIPT_NAME" in
-  performance_predict.py | ml_results.py | dfc_visualization.py | LE_embedding_visualization.py | sample_matrix_visualization.py | task_presence_binarization.py | task_timing_stats.py | cohensd.py)
+  performance_predict.py | performance_factor.py | ml_results.py | dfc_visualization.py | LE_embedding_visualization.py | sample_matrix_visualization.py | task_presence_binarization.py | task_timing_stats.py | cohensd.py)
     python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO" --simul_or_real "$SIMUL_OR_REAL"
     ;;
   *)

From a17dfd0bdbdc3e771ce07ff3221a81e66e041355 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 13 Mar 2026 20:40:08 -0400
Subject: [PATCH 366/401] bug fix

---
 task_dFC/multi_dataset_analysis/performance_factor.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index 206dfc9..34f0349 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -111,8 +111,7 @@ def get_paths(multi_dataset_info, simul_or_real):
 
 
 def prepare_ml_df(ml_scores_all):
-    assert LEVEL in ml_scores_all, f"Expected top-level key '{LEVEL}' in ML scores"
-    ml_scores = ml_scores_all[LEVEL]
+    ml_scores = ml_scores_all
 
     required_keys = [
         "task",
@@ -124,7 +123,7 @@ def prepare_ml_df(ml_scores_all):
     ]
     assert_required_keys(ml_scores, required_keys, "ALL_ML_SCORES")
 
-    df_ml_wide = dict_to_df(ml_scores, "ALL_ML_SCORES[group_lvl]")
+    df_ml_wide = dict_to_df(ml_scores, "ALL_ML_SCORES")
     df_ml_wide = df_ml_wide[df_ml_wide["group"] == GROUP].copy()
     assert not df_ml_wide.empty, f"No ML rows found for group='{GROUP}'"
 

From 6471b60022da18753b505a56137222731d30cf20 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 13 Mar 2026 21:47:21 -0400
Subject: [PATCH 367/401] add tsnr to performance_factor

---
 .../performance_factor.py                     | 60 +++++++++++++++++--
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index 34f0349..5817900 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -31,6 +31,10 @@
     "CohensD_mean",
 ]
 
+TSNR_FEATURES = [
+    "median_tsnr_avg_over_subjects",
+]
+
 
 def parse_args():
     helptext = """
@@ -87,6 +91,10 @@ def normalize_run(value):
         return "none"
     if isinstance(value, float) and np.isnan(value):
         return "none"
+    # TSV empty cells are read by pandas as NaN (float) handled above,
+    # but guard against empty strings too (e.g. after manual editing).
+    if str(value).strip() == "":
+        return "none"
     return str(value).strip().lower()
 
 
@@ -106,6 +114,7 @@ def get_paths(multi_dataset_info, simul_or_real):
         "ml": f"{output_root}/ML_results/ALL_ML_SCORES_{simul_or_real}.npy",
         "timing": f"{output_root}/task_timing_stats/{simul_or_real}/task_timing_stats_{simul_or_real}.npy",
         "cohensd": f"{output_root}/CohensD/{simul_or_real}/CohensD_ML_{simul_or_real}.npy",
+        "tsnr": f"{output_root}/t-SNR/tsnr_summary_grouped.tsv",
         "out_dir": f"{output_root}/performance_factor/{simul_or_real}",
     }
 
@@ -186,9 +195,30 @@ def prepare_cohensd_df(cohensd_dict):
     return add_join_keys(df_cohensd)
 
 
-def choose_join_keys(df_ml, df_timing, df_cohensd):
+def prepare_tsnr_df(tsnr_path):
+    assert os.path.exists(tsnr_path), f"tSNR file does not exist: {tsnr_path}"
+    df_tsnr = pd.read_csv(tsnr_path, sep="\t")
+
+    required_cols = ["dataset", "task", "run", *TSNR_FEATURES]
+    missing = [col for col in required_cols if col not in df_tsnr.columns]
+    assert not missing, f"Missing required columns in tsnr_summary_grouped.tsv: {missing}"
+
+    df_tsnr = df_tsnr[["dataset", "task", "run", *TSNR_FEATURES]].copy()
+
+    # Validate tSNR values (allow NaN — runs with no data are left empty as specified)
+    tsnr_vals = df_tsnr["median_tsnr_avg_over_subjects"].astype(float)
+    assert (
+        tsnr_vals.dropna() > 0
+    ).all(), (
+        "median_tsnr_avg_over_subjects contains non-positive values where data is present"
+    )
+
+    return add_join_keys(df_tsnr)
+
+
+def choose_join_keys(df_ml, df_timing, df_cohensd, df_tsnr):
     has_dataset_everywhere = all(
-        "dataset" in df.columns for df in [df_ml, df_timing, df_cohensd]
+        "dataset" in df.columns for df in [df_ml, df_timing, df_cohensd, df_tsnr]
     )
 
     base_keys = ["task_key", "run_key"]
@@ -196,13 +226,15 @@ def choose_join_keys(df_ml, df_timing, df_cohensd):
 
     timing_dupes_base = df_timing.duplicated(subset=base_keys).sum()
     cohensd_dupes_base = df_cohensd.duplicated(subset=base_keys).sum()
+    tsnr_dupes_base = df_tsnr.duplicated(subset=base_keys).sum()
 
-    if timing_dupes_base == 0 and cohensd_dupes_base == 0:
+    if timing_dupes_base == 0 and cohensd_dupes_base == 0 and tsnr_dupes_base == 0:
         return base_keys
 
     if has_dataset_everywhere:
         timing_dupes_dataset = df_timing.duplicated(subset=dataset_keys).sum()
         cohensd_dupes_dataset = df_cohensd.duplicated(subset=dataset_keys).sum()
+        tsnr_dupes_dataset = df_tsnr.duplicated(subset=dataset_keys).sum()
         assert timing_dupes_dataset == 0, (
             "task_timing_stats still has duplicate rows per dataset/task/run after "
             f"normalization. duplicate_count={timing_dupes_dataset}"
@@ -211,6 +243,10 @@ def choose_join_keys(df_ml, df_timing, df_cohensd):
             "CohensD_ML still has duplicate rows per dataset/task/run after "
             f"normalization. duplicate_count={cohensd_dupes_dataset}"
         )
+        assert tsnr_dupes_dataset == 0, (
+            "tsnr_summary_grouped still has duplicate rows per dataset/task/run after "
+            f"normalization. duplicate_count={tsnr_dupes_dataset}"
+        )
         return dataset_keys
 
     raise AssertionError(
@@ -219,9 +255,10 @@ def choose_join_keys(df_ml, df_timing, df_cohensd):
     )
 
 
-def merge_with_checks(df_ml, df_timing, df_cohensd, join_keys):
+def merge_with_checks(df_ml, df_timing, df_cohensd, df_tsnr, join_keys):
     timing_cols = join_keys + TIMING_FEATURES
     cohensd_cols = join_keys + COHEN_FEATURES
+    tsnr_cols = join_keys + TSNR_FEATURES
 
     df_merged = df_ml.merge(
         df_timing[timing_cols],
@@ -249,6 +286,15 @@ def merge_with_checks(df_ml, df_timing, df_cohensd, join_keys):
     ), f"Could not match Cohen's D stats for {cohensd_unmatched} ML rows using keys {join_keys}"
     df_merged = df_merged.drop(columns=["cohensd_merge"])
 
+    # tSNR: left join — rows with no tSNR data (e.g. None-run datasets not in file)
+    # will have NaN, which is acceptable as specified.
+    df_merged = df_merged.merge(
+        df_tsnr[tsnr_cols],
+        on=join_keys,
+        how="left",
+        validate="many_to_one",
+    )
+
     return df_merged
 
 
@@ -278,6 +324,7 @@ def finalize_columns(df):
         "RDoC",
         *TIMING_FEATURES,
         *COHEN_FEATURES,
+        *TSNR_FEATURES,
         "dFC assessment method",
         "classifier model",
         "embedding",
@@ -317,11 +364,12 @@ def main():
     df_ml = prepare_ml_df(ml_scores_all)
     df_timing = prepare_timing_df(timing_dict)
     df_cohensd = prepare_cohensd_df(cohensd_dict)
+    df_tsnr = prepare_tsnr_df(paths["tsnr"])
 
-    join_keys = choose_join_keys(df_ml, df_timing, df_cohensd)
+    join_keys = choose_join_keys(df_ml, df_timing, df_cohensd, df_tsnr)
     print(f"Using join keys: {join_keys}")
 
-    df = merge_with_checks(df_ml, df_timing, df_cohensd, join_keys)
+    df = merge_with_checks(df_ml, df_timing, df_cohensd, df_tsnr, join_keys)
     df = add_rdoc(df, args.simul_or_real)
     df = finalize_columns(df)
 

From 8275a211a9453594f262fccfaceae14fa28612b7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 13 Mar 2026 22:57:06 -0400
Subject: [PATCH 368/401] add perf_factor corr plot

---
 .../performance_factor.py                     | 114 +++++++++++++++++-
 1 file changed, 113 insertions(+), 1 deletion(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index 5817900..09e1801 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -3,11 +3,18 @@
 import os
 import sys
 
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import seaborn as sns
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-from helper_functions import RDoC_MAP, canon_task  # pyright: ignore[reportMissingImports]
+from helper_functions import (  # pyright: ignore[reportMissingImports]
+    RDoC_MAP,
+    canon_task,
+    savefig_pub,
+    setup_pub_style,
+)
 
 LEVEL = "group_lvl"
 GROUP = "test"
@@ -35,6 +42,16 @@
     "median_tsnr_avg_over_subjects",
 ]
 
+CORR_EXCLUDE_COLUMNS = {
+    "RDoC",
+    "task",
+    "run",
+    "dFC assessment method",
+    "classifier model",
+    "embedding",
+    "classification_balanced_accuracy",
+}
+
 
 def parse_args():
     helptext = """
@@ -351,8 +368,92 @@ def save_outputs(df, out_dir, simul_or_real):
     return csv_path, pkl_path
 
 
+def build_correlation_table(df):
+    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+    factor_cols = [
+        col
+        for col in numeric_cols
+        if col not in CORR_EXCLUDE_COLUMNS and col != "classification_balanced_accuracy"
+    ]
+    assert factor_cols, "No numeric factor columns available for correlation analysis"
+
+    rows = []
+    for method, group_df in df.groupby("dFC assessment method", observed=True):
+        for factor in factor_cols:
+            pair_df = group_df[[factor, "classification_balanced_accuracy"]].dropna()
+            n_samples = len(pair_df)
+
+            if (
+                n_samples < 3
+                or pair_df[factor].nunique(dropna=True) < 2
+                or pair_df["classification_balanced_accuracy"].nunique(dropna=True) < 2
+            ):
+                corr = np.nan
+            else:
+                corr = pair_df[factor].corr(
+                    pair_df["classification_balanced_accuracy"], method="pearson"
+                )
+
+            rows.append(
+                {
+                    "factor": factor,
+                    "dFC assessment method": method,
+                    "correlation": corr,
+                    "n_samples": n_samples,
+                }
+            )
+
+    corr_df = pd.DataFrame(rows)
+    corr_df["factor"] = pd.Categorical(
+        corr_df["factor"], categories=factor_cols, ordered=True
+    )
+    corr_df = corr_df.sort_values(["factor", "dFC assessment method"]).reset_index(
+        drop=True
+    )
+    return corr_df
+
+
+def plot_factor_correlation_pointplot(corr_df, out_dir, simul_or_real):
+    valid_df = corr_df.dropna(subset=["correlation"]).copy()
+    assert (
+        not valid_df.empty
+    ), "All factor correlations are NaN; cannot generate correlation pointplot"
+
+    n_factors = valid_df["factor"].nunique()
+    width = max(10, 0.75 * n_factors)
+    height = 5.5
+
+    figure, ax = plt.subplots(figsize=(width, height))
+    sns.pointplot(
+        data=valid_df,
+        x="factor",
+        y="correlation",
+        hue="dFC assessment method",
+        dodge=0.4,
+        errorbar=None,
+        markers="o",
+        linestyles="",
+        ax=ax,
+    )
+
+    ax.axhline(0.0, color="#333333", linestyle="--", linewidth=1.0)
+    ax.set_ylim(-1.05, 1.05)
+    ax.set_xlabel("Factor")
+    ax.set_ylabel("Pearson correlation with classification balanced accuracy")
+    plt.setp(ax.get_xticklabels(), rotation=35, ha="right")
+    ax.legend(title="dFC assessment method", frameon=True)
+    sns.despine(ax=ax, top=True, right=True)
+    figure.tight_layout()
+
+    fig_path = f"{out_dir}/performance_factor_correlation_pointplot_{simul_or_real}.png"
+    savefig_pub(fig_path)
+    plt.close(figure)
+    return fig_path
+
+
 def main():
     args = parse_args()
+    setup_pub_style()
 
     multi_dataset_info = read_json(args.multi_dataset_info)
     paths = get_paths(multi_dataset_info, args.simul_or_real)
@@ -375,9 +476,20 @@ def main():
 
     csv_path, pkl_path = save_outputs(df, paths["out_dir"], args.simul_or_real)
 
+    corr_df = build_correlation_table(df)
+    corr_csv_path = (
+        f"{paths['out_dir']}/performance_factor_correlations_{args.simul_or_real}.csv"
+    )
+    corr_df.to_csv(corr_csv_path, index=False)
+    corr_fig_path = plot_factor_correlation_pointplot(
+        corr_df, paths["out_dir"], args.simul_or_real
+    )
+
     print(f"Saved dataframe with shape: {df.shape}")
     print(f"CSV: {csv_path}")
     print(f"PKL: {pkl_path}")
+    print(f"Correlation CSV: {corr_csv_path}")
+    print(f"Correlation figure: {corr_fig_path}")
 
 
 if __name__ == "__main__":

From f9104b539a4e803c2e5e7809831f01d892e37073 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 13 Mar 2026 23:13:52 -0400
Subject: [PATCH 369/401] add rdoc plot

---
 .../performance_factor.py                     | 176 +++++++++++++++---
 1 file changed, 151 insertions(+), 25 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index 09e1801..9747d58 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -7,6 +7,7 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
+from matplotlib.ticker import MultipleLocator
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from helper_functions import (  # pyright: ignore[reportMissingImports]
@@ -233,17 +234,21 @@ def prepare_tsnr_df(tsnr_path):
     return add_join_keys(df_tsnr)
 
 
-def choose_join_keys(df_ml, df_timing, df_cohensd, df_tsnr):
-    has_dataset_everywhere = all(
-        "dataset" in df.columns for df in [df_ml, df_timing, df_cohensd, df_tsnr]
-    )
+def choose_join_keys(df_ml, df_timing, df_cohensd, df_tsnr=None):
+    sources = [df_ml, df_timing, df_cohensd]
+    if df_tsnr is not None:
+        sources.append(df_tsnr)
+
+    has_dataset_everywhere = all("dataset" in df.columns for df in sources)
 
     base_keys = ["task_key", "run_key"]
     dataset_keys = ["dataset", *base_keys]
 
     timing_dupes_base = df_timing.duplicated(subset=base_keys).sum()
     cohensd_dupes_base = df_cohensd.duplicated(subset=base_keys).sum()
-    tsnr_dupes_base = df_tsnr.duplicated(subset=base_keys).sum()
+    tsnr_dupes_base = (
+        df_tsnr.duplicated(subset=base_keys).sum() if df_tsnr is not None else 0
+    )
 
     if timing_dupes_base == 0 and cohensd_dupes_base == 0 and tsnr_dupes_base == 0:
         return base_keys
@@ -251,7 +256,9 @@ def choose_join_keys(df_ml, df_timing, df_cohensd, df_tsnr):
     if has_dataset_everywhere:
         timing_dupes_dataset = df_timing.duplicated(subset=dataset_keys).sum()
         cohensd_dupes_dataset = df_cohensd.duplicated(subset=dataset_keys).sum()
-        tsnr_dupes_dataset = df_tsnr.duplicated(subset=dataset_keys).sum()
+        tsnr_dupes_dataset = (
+            df_tsnr.duplicated(subset=dataset_keys).sum() if df_tsnr is not None else 0
+        )
         assert timing_dupes_dataset == 0, (
             "task_timing_stats still has duplicate rows per dataset/task/run after "
             f"normalization. duplicate_count={timing_dupes_dataset}"
@@ -260,10 +267,11 @@ def choose_join_keys(df_ml, df_timing, df_cohensd, df_tsnr):
             "CohensD_ML still has duplicate rows per dataset/task/run after "
             f"normalization. duplicate_count={cohensd_dupes_dataset}"
         )
-        assert tsnr_dupes_dataset == 0, (
-            "tsnr_summary_grouped still has duplicate rows per dataset/task/run after "
-            f"normalization. duplicate_count={tsnr_dupes_dataset}"
-        )
+        if df_tsnr is not None:
+            assert tsnr_dupes_dataset == 0, (
+                "tsnr_summary_grouped still has duplicate rows per dataset/task/run after "
+                f"normalization. duplicate_count={tsnr_dupes_dataset}"
+            )
         return dataset_keys
 
     raise AssertionError(
@@ -272,10 +280,9 @@ def choose_join_keys(df_ml, df_timing, df_cohensd, df_tsnr):
     )
 
 
-def merge_with_checks(df_ml, df_timing, df_cohensd, df_tsnr, join_keys):
+def merge_with_checks(df_ml, df_timing, df_cohensd, join_keys, df_tsnr=None):
     timing_cols = join_keys + TIMING_FEATURES
     cohensd_cols = join_keys + COHEN_FEATURES
-    tsnr_cols = join_keys + TSNR_FEATURES
 
     df_merged = df_ml.merge(
         df_timing[timing_cols],
@@ -303,14 +310,16 @@ def merge_with_checks(df_ml, df_timing, df_cohensd, df_tsnr, join_keys):
     ), f"Could not match Cohen's D stats for {cohensd_unmatched} ML rows using keys {join_keys}"
     df_merged = df_merged.drop(columns=["cohensd_merge"])
 
-    # tSNR: left join — rows with no tSNR data (e.g. None-run datasets not in file)
-    # will have NaN, which is acceptable as specified.
-    df_merged = df_merged.merge(
-        df_tsnr[tsnr_cols],
-        on=join_keys,
-        how="left",
-        validate="many_to_one",
-    )
+    if df_tsnr is not None:
+        tsnr_cols = join_keys + TSNR_FEATURES
+        # tSNR: left join — rows with no tSNR data (e.g. None-run datasets not in file)
+        # will have NaN, which is acceptable as specified.
+        df_merged = df_merged.merge(
+            df_tsnr[tsnr_cols],
+            on=join_keys,
+            how="left",
+            validate="many_to_one",
+        )
 
     return df_merged
 
@@ -341,13 +350,16 @@ def finalize_columns(df):
         "RDoC",
         *TIMING_FEATURES,
         *COHEN_FEATURES,
-        *TSNR_FEATURES,
         "dFC assessment method",
         "classifier model",
         "embedding",
         "classification_balanced_accuracy",
     ]
 
+    if all(col in df.columns for col in TSNR_FEATURES):
+        insert_at = cols.index("dFC assessment method")
+        cols[insert_at:insert_at] = TSNR_FEATURES
+
     missing = [col for col in cols if col not in df.columns]
     assert not missing, f"Missing expected final columns: {missing}"
 
@@ -421,7 +433,7 @@ def plot_factor_correlation_pointplot(corr_df, out_dir, simul_or_real):
 
     n_factors = valid_df["factor"].nunique()
     width = max(10, 0.75 * n_factors)
-    height = 5.5
+    height = 7.0
 
     figure, ax = plt.subplots(figsize=(width, height))
     sns.pointplot(
@@ -439,8 +451,16 @@ def plot_factor_correlation_pointplot(corr_df, out_dir, simul_or_real):
     ax.axhline(0.0, color="#333333", linestyle="--", linewidth=1.0)
     ax.set_ylim(-1.05, 1.05)
     ax.set_xlabel("Factor")
-    ax.set_ylabel("Pearson correlation with classification balanced accuracy")
+    ax.set_ylabel("Corr. with balanced accuracy")
     plt.setp(ax.get_xticklabels(), rotation=35, ha="right")
+    ax.tick_params(axis="x", labelsize=11)
+    ax.tick_params(axis="y", labelsize=11)
+
+    ax.yaxis.set_major_locator(MultipleLocator(0.25))
+    ax.yaxis.set_minor_locator(MultipleLocator(0.125))
+    ax.grid(True, axis="y", which="major", linestyle="-", alpha=0.4)
+    ax.grid(True, axis="y", which="minor", linestyle="--", alpha=0.22)
+
     ax.legend(title="dFC assessment method", frameon=True)
     sns.despine(ax=ax, top=True, right=True)
     figure.tight_layout()
@@ -451,6 +471,102 @@ def plot_factor_correlation_pointplot(corr_df, out_dir, simul_or_real):
     return fig_path
 
 
+def _get_present_rdoc_order(df, simul_or_real):
+    domain_order = RDoC_MAP[simul_or_real]["DOMAIN_ORDER"]
+    present = set(df["RDoC"].dropna().astype(str).unique())
+    ordered = [domain for domain in domain_order if domain in present]
+    remaining = sorted([domain for domain in present if domain not in ordered])
+    return ordered + remaining
+
+
+def plot_rdoc_overall_distribution(df, out_dir, simul_or_real):
+    rdoc_order = _get_present_rdoc_order(df, simul_or_real)
+    assert rdoc_order, "No RDoC values found for plotting"
+
+    width = max(10, 1.3 * len(rdoc_order))
+    height = 6.5
+    figure, ax = plt.subplots(figsize=(width, height))
+
+    sns.boxplot(
+        data=df,
+        x="RDoC",
+        y="classification_balanced_accuracy",
+        order=rdoc_order,
+        showfliers=False,
+        width=0.55,
+        ax=ax,
+    )
+    sns.stripplot(
+        data=df,
+        x="RDoC",
+        y="classification_balanced_accuracy",
+        order=rdoc_order,
+        color="#303030",
+        alpha=0.55,
+        size=3,
+        jitter=0.22,
+        ax=ax,
+    )
+
+    ax.set_xlabel("RDoC domain")
+    ax.set_ylabel("Balanced accuracy")
+    ax.set_ylim(0.45, 1.02)
+    plt.setp(ax.get_xticklabels(), rotation=25, ha="right")
+    ax.yaxis.set_major_locator(MultipleLocator(0.05))
+    ax.yaxis.set_minor_locator(MultipleLocator(0.025))
+    ax.grid(True, axis="y", which="major", linestyle="-", alpha=0.35)
+    ax.grid(True, axis="y", which="minor", linestyle="--", alpha=0.2)
+    sns.despine(ax=ax, top=True, right=True)
+    figure.tight_layout()
+
+    fig_path = f"{out_dir}/performance_by_rdoc_overall_{simul_or_real}.png"
+    savefig_pub(fig_path)
+    plt.close(figure)
+    return fig_path
+
+
+def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real):
+    rdoc_order = _get_present_rdoc_order(df, simul_or_real)
+    assert rdoc_order, "No RDoC values found for plotting"
+
+    facet = sns.catplot(
+        data=df,
+        x="RDoC",
+        y="classification_balanced_accuracy",
+        hue="dFC assessment method",
+        row="classifier model",
+        col="embedding",
+        kind="box",
+        showfliers=False,
+        order=rdoc_order,
+        height=4.2,
+        aspect=1.15,
+        sharey=True,
+    )
+
+    for ax in facet.axes.flat:
+        if ax is None:
+            continue
+        ax.set_ylim(0.45, 1.02)
+        ax.yaxis.set_major_locator(MultipleLocator(0.1))
+        ax.yaxis.set_minor_locator(MultipleLocator(0.05))
+        ax.grid(True, axis="y", which="major", linestyle="-", alpha=0.32)
+        ax.grid(True, axis="y", which="minor", linestyle="--", alpha=0.18)
+        for label in ax.get_xticklabels():
+            label.set_rotation(30)
+            label.set_horizontalalignment("right")
+
+    facet.set_axis_labels("RDoC domain", "Balanced accuracy")
+    if facet.legend is not None:
+        facet.legend.set_title("dFC assessment method")
+
+    facet.figure.tight_layout()
+    fig_path = f"{out_dir}/performance_by_rdoc_faceted_{simul_or_real}.png"
+    facet.figure.savefig(fig_path, dpi=1000, bbox_inches="tight", pad_inches=0.04)
+    plt.close(facet.figure)
+    return fig_path
+
+
 def main():
     args = parse_args()
     setup_pub_style()
@@ -465,12 +581,14 @@ def main():
     df_ml = prepare_ml_df(ml_scores_all)
     df_timing = prepare_timing_df(timing_dict)
     df_cohensd = prepare_cohensd_df(cohensd_dict)
-    df_tsnr = prepare_tsnr_df(paths["tsnr"])
+    df_tsnr = None
+    if args.simul_or_real == "real":
+        df_tsnr = prepare_tsnr_df(paths["tsnr"])
 
     join_keys = choose_join_keys(df_ml, df_timing, df_cohensd, df_tsnr)
     print(f"Using join keys: {join_keys}")
 
-    df = merge_with_checks(df_ml, df_timing, df_cohensd, df_tsnr, join_keys)
+    df = merge_with_checks(df_ml, df_timing, df_cohensd, join_keys, df_tsnr)
     df = add_rdoc(df, args.simul_or_real)
     df = finalize_columns(df)
 
@@ -484,12 +602,20 @@ def main():
     corr_fig_path = plot_factor_correlation_pointplot(
         corr_df, paths["out_dir"], args.simul_or_real
     )
+    rdoc_overall_path = plot_rdoc_overall_distribution(
+        df, paths["out_dir"], args.simul_or_real
+    )
+    rdoc_faceted_path = plot_rdoc_faceted_distribution(
+        df, paths["out_dir"], args.simul_or_real
+    )
 
     print(f"Saved dataframe with shape: {df.shape}")
     print(f"CSV: {csv_path}")
     print(f"PKL: {pkl_path}")
     print(f"Correlation CSV: {corr_csv_path}")
     print(f"Correlation figure: {corr_fig_path}")
+    print(f"RDoC overall figure: {rdoc_overall_path}")
+    print(f"RDoC faceted figure: {rdoc_faceted_path}")
 
 
 if __name__ == "__main__":

From 0c0b6dbb8336917720d828e56bbda73558a27bc2 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 13 Mar 2026 23:30:27 -0400
Subject: [PATCH 370/401] add Top-vs-bottom performance profile plot

---
 .../performance_factor.py                     | 120 ++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index 9747d58..45096f2 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -53,6 +53,8 @@
     "classification_balanced_accuracy",
 }
 
+TOP_BOTTOM_QUANTILE = 0.2
+
 
 def parse_args():
     helptext = """
@@ -425,6 +427,17 @@ def build_correlation_table(df):
     return corr_df
 
 
+def get_numeric_factor_columns(df):
+    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+    factor_cols = [
+        col
+        for col in numeric_cols
+        if col not in CORR_EXCLUDE_COLUMNS and col != "classification_balanced_accuracy"
+    ]
+    assert factor_cols, "No numeric factor columns available for analysis"
+    return factor_cols
+
+
 def plot_factor_correlation_pointplot(corr_df, out_dir, simul_or_real):
     valid_df = corr_df.dropna(subset=["correlation"]).copy()
     assert (
@@ -471,6 +484,101 @@ def plot_factor_correlation_pointplot(corr_df, out_dir, simul_or_real):
     return fig_path
 
 
+def build_top_bottom_profile_table(df, quantile=TOP_BOTTOM_QUANTILE):
+    assert 0 < quantile < 0.5, "quantile must be in (0, 0.5)"
+
+    factor_cols = get_numeric_factor_columns(df)
+    score = df["classification_balanced_accuracy"].astype(float)
+
+    low_thr = score.quantile(quantile)
+    high_thr = score.quantile(1 - quantile)
+
+    bottom_df = df[score <= low_thr].copy()
+    top_df = df[score >= high_thr].copy()
+
+    assert (
+        len(top_df) >= 3 and len(bottom_df) >= 3
+    ), "Not enough samples in top/bottom groups for profile analysis"
+
+    rows = []
+    for factor in factor_cols:
+        top_vals = top_df[factor].astype(float).dropna()
+        bottom_vals = bottom_df[factor].astype(float).dropna()
+        n_top = len(top_vals)
+        n_bottom = len(bottom_vals)
+
+        mean_top = np.nan if n_top == 0 else float(top_vals.mean())
+        mean_bottom = np.nan if n_bottom == 0 else float(bottom_vals.mean())
+        mean_diff = mean_top - mean_bottom if (n_top > 0 and n_bottom > 0) else np.nan
+
+        cohens_d = np.nan
+        if n_top >= 2 and n_bottom >= 2:
+            var_top = float(np.var(top_vals, ddof=1))
+            var_bottom = float(np.var(bottom_vals, ddof=1))
+            pooled_num = ((n_top - 1) * var_top) + ((n_bottom - 1) * var_bottom)
+            pooled_den = n_top + n_bottom - 2
+            if pooled_den > 0:
+                pooled_std = np.sqrt(pooled_num / pooled_den)
+                if np.isfinite(pooled_std) and pooled_std > 0:
+                    cohens_d = mean_diff / pooled_std
+
+        rows.append(
+            {
+                "factor": factor,
+                "mean_top": mean_top,
+                "mean_bottom": mean_bottom,
+                "mean_diff": mean_diff,
+                "cohens_d": cohens_d,
+                "n_top": n_top,
+                "n_bottom": n_bottom,
+                "low_threshold": float(low_thr),
+                "high_threshold": float(high_thr),
+            }
+        )
+
+    profile_df = pd.DataFrame(rows)
+    profile_df["abs_cohens_d"] = profile_df["cohens_d"].abs()
+    profile_df = profile_df.sort_values("abs_cohens_d", ascending=False).reset_index(
+        drop=True
+    )
+    return profile_df
+
+
+def plot_top_bottom_profile(profile_df, out_dir, simul_or_real):
+    valid_df = profile_df.dropna(subset=["cohens_d"]).copy()
+    assert (
+        not valid_df.empty
+    ), "No valid Cohen's d values available for top-vs-bottom profile plot"
+
+    valid_df = valid_df.sort_values("abs_cohens_d", ascending=True)
+    valid_df["factor"] = pd.Categorical(
+        valid_df["factor"], categories=valid_df["factor"].tolist(), ordered=True
+    )
+
+    height = max(6.0, 0.45 * len(valid_df))
+    figure, ax = plt.subplots(figsize=(10.5, height))
+    sns.scatterplot(
+        data=valid_df,
+        x="cohens_d",
+        y="factor",
+        s=85,
+        color="#1f4e79",
+        ax=ax,
+    )
+
+    ax.axvline(0.0, color="#333333", linestyle="--", linewidth=1.1)
+    ax.set_xlabel("Effect size (Cohen's d): Top 20% vs Bottom 20%")
+    ax.set_ylabel("Factor")
+    ax.grid(True, axis="x", which="major", linestyle="-", alpha=0.35)
+    sns.despine(ax=ax, top=True, right=True)
+    figure.tight_layout()
+
+    fig_path = f"{out_dir}/performance_top_bottom_profile_{simul_or_real}.png"
+    savefig_pub(fig_path)
+    plt.close(figure)
+    return fig_path
+
+
 def _get_present_rdoc_order(df, simul_or_real):
     domain_order = RDoC_MAP[simul_or_real]["DOMAIN_ORDER"]
     present = set(df["RDoC"].dropna().astype(str).unique())
@@ -602,6 +710,16 @@ def main():
     corr_fig_path = plot_factor_correlation_pointplot(
         corr_df, paths["out_dir"], args.simul_or_real
     )
+
+    profile_df = build_top_bottom_profile_table(df, quantile=TOP_BOTTOM_QUANTILE)
+    profile_csv_path = (
+        f"{paths['out_dir']}/performance_top_bottom_profile_{args.simul_or_real}.csv"
+    )
+    profile_df.to_csv(profile_csv_path, index=False)
+    profile_fig_path = plot_top_bottom_profile(
+        profile_df, paths["out_dir"], args.simul_or_real
+    )
+
     rdoc_overall_path = plot_rdoc_overall_distribution(
         df, paths["out_dir"], args.simul_or_real
     )
@@ -614,6 +732,8 @@ def main():
     print(f"PKL: {pkl_path}")
     print(f"Correlation CSV: {corr_csv_path}")
     print(f"Correlation figure: {corr_fig_path}")
+    print(f"Top-bottom profile CSV: {profile_csv_path}")
+    print(f"Top-bottom profile figure: {profile_fig_path}")
     print(f"RDoC overall figure: {rdoc_overall_path}")
     print(f"RDoC faceted figure: {rdoc_faceted_path}")
 

From 9c8616876ff44ec385ef85cd7e49b8ad3f01b490 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 13 Mar 2026 23:43:20 -0400
Subject: [PATCH 371/401] minor

---
 .../performance_factor.py                     | 120 ++++++++++--------
 1 file changed, 68 insertions(+), 52 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index 45096f2..c1e83bb 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -488,59 +488,68 @@ def build_top_bottom_profile_table(df, quantile=TOP_BOTTOM_QUANTILE):
     assert 0 < quantile < 0.5, "quantile must be in (0, 0.5)"
 
     factor_cols = get_numeric_factor_columns(df)
-    score = df["classification_balanced_accuracy"].astype(float)
 
-    low_thr = score.quantile(quantile)
-    high_thr = score.quantile(1 - quantile)
+    rows = []
+    for method, method_df in df.groupby("dFC assessment method", observed=True):
+        score = method_df["classification_balanced_accuracy"].astype(float)
+        low_thr = score.quantile(quantile)
+        high_thr = score.quantile(1 - quantile)
+
+        bottom_df = method_df[score <= low_thr].copy()
+        top_df = method_df[score >= high_thr].copy()
+
+        if len(top_df) < 3 or len(bottom_df) < 3:
+            print(
+                f"[TopBottom] Skipping method '{method}' due to too few samples "
+                f"(top={len(top_df)}, bottom={len(bottom_df)})."
+            )
+            continue
 
-    bottom_df = df[score <= low_thr].copy()
-    top_df = df[score >= high_thr].copy()
+        for factor in factor_cols:
+            top_vals = top_df[factor].astype(float).dropna()
+            bottom_vals = bottom_df[factor].astype(float).dropna()
+            n_top = len(top_vals)
+            n_bottom = len(bottom_vals)
+
+            mean_top = np.nan if n_top == 0 else float(top_vals.mean())
+            mean_bottom = np.nan if n_bottom == 0 else float(bottom_vals.mean())
+            mean_diff = mean_top - mean_bottom if (n_top > 0 and n_bottom > 0) else np.nan
+
+            cohens_d = np.nan
+            if n_top >= 2 and n_bottom >= 2:
+                var_top = float(np.var(top_vals, ddof=1))
+                var_bottom = float(np.var(bottom_vals, ddof=1))
+                pooled_num = ((n_top - 1) * var_top) + ((n_bottom - 1) * var_bottom)
+                pooled_den = n_top + n_bottom - 2
+                if pooled_den > 0:
+                    pooled_std = np.sqrt(pooled_num / pooled_den)
+                    if np.isfinite(pooled_std) and pooled_std > 0:
+                        cohens_d = mean_diff / pooled_std
 
-    assert (
-        len(top_df) >= 3 and len(bottom_df) >= 3
-    ), "Not enough samples in top/bottom groups for profile analysis"
+            rows.append(
+                {
+                    "factor": factor,
+                    "dFC assessment method": method,
+                    "mean_top": mean_top,
+                    "mean_bottom": mean_bottom,
+                    "mean_diff": mean_diff,
+                    "cohens_d": cohens_d,
+                    "n_top": n_top,
+                    "n_bottom": n_bottom,
+                    "low_threshold": float(low_thr),
+                    "high_threshold": float(high_thr),
+                    "n_method_total": int(len(method_df)),
+                }
+            )
 
-    rows = []
-    for factor in factor_cols:
-        top_vals = top_df[factor].astype(float).dropna()
-        bottom_vals = bottom_df[factor].astype(float).dropna()
-        n_top = len(top_vals)
-        n_bottom = len(bottom_vals)
-
-        mean_top = np.nan if n_top == 0 else float(top_vals.mean())
-        mean_bottom = np.nan if n_bottom == 0 else float(bottom_vals.mean())
-        mean_diff = mean_top - mean_bottom if (n_top > 0 and n_bottom > 0) else np.nan
-
-        cohens_d = np.nan
-        if n_top >= 2 and n_bottom >= 2:
-            var_top = float(np.var(top_vals, ddof=1))
-            var_bottom = float(np.var(bottom_vals, ddof=1))
-            pooled_num = ((n_top - 1) * var_top) + ((n_bottom - 1) * var_bottom)
-            pooled_den = n_top + n_bottom - 2
-            if pooled_den > 0:
-                pooled_std = np.sqrt(pooled_num / pooled_den)
-                if np.isfinite(pooled_std) and pooled_std > 0:
-                    cohens_d = mean_diff / pooled_std
-
-        rows.append(
-            {
-                "factor": factor,
-                "mean_top": mean_top,
-                "mean_bottom": mean_bottom,
-                "mean_diff": mean_diff,
-                "cohens_d": cohens_d,
-                "n_top": n_top,
-                "n_bottom": n_bottom,
-                "low_threshold": float(low_thr),
-                "high_threshold": float(high_thr),
-            }
-        )
+    assert rows, "No method had enough samples for top-vs-bottom profile analysis"
 
     profile_df = pd.DataFrame(rows)
     profile_df["abs_cohens_d"] = profile_df["cohens_d"].abs()
-    profile_df = profile_df.sort_values("abs_cohens_d", ascending=False).reset_index(
-        drop=True
-    )
+    profile_df = profile_df.sort_values(
+        ["abs_cohens_d", "factor", "dFC assessment method"],
+        ascending=[False, True, True],
+    ).reset_index(drop=True)
     return profile_df
 
 
@@ -550,26 +559,33 @@ def plot_top_bottom_profile(profile_df, out_dir, simul_or_real):
         not valid_df.empty
     ), "No valid Cohen's d values available for top-vs-bottom profile plot"
 
-    valid_df = valid_df.sort_values("abs_cohens_d", ascending=True)
+    factor_order = (
+        valid_df.groupby("factor", observed=True)["abs_cohens_d"]
+        .max()
+        .sort_values(ascending=True)
+        .index.tolist()
+    )
+    valid_df = valid_df.sort_values(["factor", "dFC assessment method"])
     valid_df["factor"] = pd.Categorical(
-        valid_df["factor"], categories=valid_df["factor"].tolist(), ordered=True
+        valid_df["factor"], categories=factor_order, ordered=True
     )
 
-    height = max(6.0, 0.45 * len(valid_df))
-    figure, ax = plt.subplots(figsize=(10.5, height))
+    height = max(6.0, 0.55 * len(factor_order))
+    figure, ax = plt.subplots(figsize=(12.5, height))
     sns.scatterplot(
         data=valid_df,
         x="cohens_d",
         y="factor",
+        hue="dFC assessment method",
         s=85,
-        color="#1f4e79",
         ax=ax,
     )
 
     ax.axvline(0.0, color="#333333", linestyle="--", linewidth=1.1)
-    ax.set_xlabel("Effect size (Cohen's d): Top 20% vs Bottom 20%")
+    ax.set_xlabel("Effect size (Cohen's d): Top 20% vs Bottom 20% within method")
     ax.set_ylabel("Factor")
     ax.grid(True, axis="x", which="major", linestyle="-", alpha=0.35)
+    ax.legend(title="dFC assessment method", frameon=True)
     sns.despine(ax=ax, top=True, right=True)
     figure.tight_layout()
 

From 5559b24f0a1e4867ebfbbf5dcd600dee34b7f57d Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 13 Mar 2026 23:51:24 -0400
Subject: [PATCH 372/401] minor

---
 .../performance_factor.py                     | 91 +++++++++++++------
 1 file changed, 62 insertions(+), 29 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index c1e83bb..2d77599 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -653,42 +653,75 @@ def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real):
     rdoc_order = _get_present_rdoc_order(df, simul_or_real)
     assert rdoc_order, "No RDoC values found for plotting"
 
-    facet = sns.catplot(
-        data=df,
-        x="RDoC",
-        y="classification_balanced_accuracy",
-        hue="dFC assessment method",
-        row="classifier model",
-        col="embedding",
-        kind="box",
-        showfliers=False,
-        order=rdoc_order,
-        height=4.2,
-        aspect=1.15,
-        sharey=True,
+    combo_df = (
+        df[["classifier model", "embedding"]]
+        .drop_duplicates()
+        .sort_values(["classifier model", "embedding"])
     )
+    assert not combo_df.empty, "No classifier/embedding combinations found for plotting"
+
+    fig_paths = []
+    for _, combo in combo_df.iterrows():
+        classifier = combo["classifier model"]
+        embedding = combo["embedding"]
 
-    for ax in facet.axes.flat:
-        if ax is None:
+        sub_df = df[
+            (df["classifier model"] == classifier) & (df["embedding"] == embedding)
+        ].copy()
+        if sub_df.empty:
             continue
+
+        width = max(9.5, 1.45 * len(rdoc_order))
+        height = 6.8
+        figure, ax = plt.subplots(figsize=(width, height))
+
+        sns.boxplot(
+            data=sub_df,
+            x="RDoC",
+            y="classification_balanced_accuracy",
+            hue="dFC assessment method",
+            order=rdoc_order,
+            showfliers=False,
+            width=0.68,
+            ax=ax,
+        )
+
         ax.set_ylim(0.45, 1.02)
-        ax.yaxis.set_major_locator(MultipleLocator(0.1))
-        ax.yaxis.set_minor_locator(MultipleLocator(0.05))
-        ax.grid(True, axis="y", which="major", linestyle="-", alpha=0.32)
-        ax.grid(True, axis="y", which="minor", linestyle="--", alpha=0.18)
+        ax.set_xlabel("RDoC domain")
+        ax.set_ylabel("Balanced accuracy")
+        ax.set_title(f"{classifier} | {embedding}", fontweight="bold", pad=10)
+        ax.yaxis.set_major_locator(MultipleLocator(0.05))
+        ax.yaxis.set_minor_locator(MultipleLocator(0.025))
+        ax.grid(True, axis="y", which="major", linestyle="-", alpha=0.36)
+        ax.grid(True, axis="y", which="minor", linestyle="--", alpha=0.20)
         for label in ax.get_xticklabels():
             label.set_rotation(30)
             label.set_horizontalalignment("right")
 
-    facet.set_axis_labels("RDoC domain", "Balanced accuracy")
-    if facet.legend is not None:
-        facet.legend.set_title("dFC assessment method")
+        handles, labels = ax.get_legend_handles_labels()
+        if handles:
+            ax.legend(
+                handles,
+                labels,
+                title="dFC assessment method",
+                frameon=True,
+                loc="upper left",
+                bbox_to_anchor=(1.01, 1.0),
+                borderaxespad=0.0,
+            )
 
-    facet.figure.tight_layout()
-    fig_path = f"{out_dir}/performance_by_rdoc_faceted_{simul_or_real}.png"
-    facet.figure.savefig(fig_path, dpi=1000, bbox_inches="tight", pad_inches=0.04)
-    plt.close(facet.figure)
-    return fig_path
+        sns.despine(ax=ax, top=True, right=True)
+        figure.tight_layout()
+
+        classifier_key = str(classifier).replace(" ", "_").replace("/", "-")
+        embedding_key = str(embedding).replace(" ", "_").replace("/", "-")
+        fig_path = f"{out_dir}/performance_by_rdoc_{classifier_key}_{embedding_key}_{simul_or_real}.png"
+        savefig_pub(fig_path)
+        plt.close(figure)
+        fig_paths.append(fig_path)
+
+    assert fig_paths, "No RDoC per-combination figures were generated"
+    return fig_paths
 
 
 def main():
@@ -739,7 +772,7 @@ def main():
     rdoc_overall_path = plot_rdoc_overall_distribution(
         df, paths["out_dir"], args.simul_or_real
     )
-    rdoc_faceted_path = plot_rdoc_faceted_distribution(
+    rdoc_faceted_paths = plot_rdoc_faceted_distribution(
         df, paths["out_dir"], args.simul_or_real
     )
 
@@ -751,7 +784,7 @@ def main():
     print(f"Top-bottom profile CSV: {profile_csv_path}")
     print(f"Top-bottom profile figure: {profile_fig_path}")
     print(f"RDoC overall figure: {rdoc_overall_path}")
-    print(f"RDoC faceted figure: {rdoc_faceted_path}")
+    print(f"RDoC per-combination figures: {len(rdoc_faceted_paths)} files")
 
 
 if __name__ == "__main__":

From 6ff0c14fbe1728bb641aeb0bf5e8d5f8d15fbd9d Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 14 Mar 2026 00:06:00 -0400
Subject: [PATCH 373/401] RDoC ratio plot

---
 .../performance_factor.py                     | 141 ++++++++++++++++++
 1 file changed, 141 insertions(+)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index 2d77599..29797c1 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -54,6 +54,7 @@
 }
 
 TOP_BOTTOM_QUANTILE = 0.2
+PERFORMANCE_GROUP_LABELS = ["Low", "Medium", "High"]
 
 
 def parse_args():
@@ -603,6 +604,130 @@ def _get_present_rdoc_order(df, simul_or_real):
     return ordered + remaining
 
 
+def add_performance_group(df):
+    df = df.copy()
+    score = df["classification_balanced_accuracy"].astype(float)
+    low_thr = score.quantile(0.25)
+    high_thr = score.quantile(0.75)
+    assert (
+        low_thr < high_thr
+    ), "Performance-group thresholds collapsed; cannot form 25/50/25 groups"
+
+    df["performance_group"] = pd.cut(
+        score,
+        bins=[-np.inf, low_thr, high_thr, np.inf],
+        labels=PERFORMANCE_GROUP_LABELS,
+        include_lowest=True,
+    )
+    assert df["performance_group"].notna().all(), "Failed to assign performance groups"
+    return df, float(low_thr), float(high_thr)
+
+
+def build_rdoc_performance_group_table(df, simul_or_real):
+    df_grouped, low_thr, high_thr = add_performance_group(df)
+    rdoc_order = _get_present_rdoc_order(df_grouped, simul_or_real)
+    assert rdoc_order, "No RDoC values found for RDoC-performance grouping"
+
+    count_table = (
+        df_grouped.groupby(["RDoC", "performance_group"], observed=True)
+        .size()
+        .unstack(fill_value=0)
+        .reindex(index=rdoc_order, columns=PERFORMANCE_GROUP_LABELS, fill_value=0)
+    )
+    proportion_table = count_table.div(count_table.sum(axis=1), axis=0)
+    assert np.isclose(
+        proportion_table.sum(axis=1), 1.0
+    ).all(), "RDoC performance-group proportions do not sum to 1"
+
+    summary_long = (
+        count_table.stack()
+        .rename("count")
+        .reset_index()
+        .rename(columns={"level_1": "performance_group"})
+    )
+    summary_long["proportion"] = [
+        proportion_table.loc[row.RDoC, row.performance_group]
+        for row in summary_long.itertuples(index=False)
+    ]
+    summary_long["low_threshold"] = low_thr
+    summary_long["high_threshold"] = high_thr
+    return summary_long, count_table, proportion_table
+
+
+def plot_rdoc_performance_group_stacked_bar(proportion_table, out_dir, simul_or_real):
+    width = max(9.0, 1.35 * len(proportion_table.index))
+    figure, ax = plt.subplots(figsize=(width, 6.6))
+
+    palette = {
+        "Low": "#C44E52",
+        "Medium": "#DDCF84",
+        "High": "#4C9F70",
+    }
+    proportion_pct = proportion_table.mul(100.0)
+    bottom = np.zeros(len(proportion_pct.index))
+
+    for label in PERFORMANCE_GROUP_LABELS:
+        values = proportion_pct[label].to_numpy()
+        ax.bar(
+            proportion_pct.index,
+            values,
+            bottom=bottom,
+            label=label,
+            color=palette[label],
+            edgecolor="white",
+            linewidth=0.8,
+        )
+        bottom += values
+
+    ax.set_xlabel("RDoC domain")
+    ax.set_ylabel("Samples (%)")
+    ax.set_ylim(0, 100)
+    ax.yaxis.set_major_locator(MultipleLocator(10))
+    ax.yaxis.set_minor_locator(MultipleLocator(5))
+    ax.grid(True, axis="y", which="major", linestyle="-", alpha=0.35)
+    ax.grid(True, axis="y", which="minor", linestyle="--", alpha=0.18)
+    plt.setp(ax.get_xticklabels(), rotation=25, ha="right")
+    ax.legend(title="Performance group", frameon=True)
+    sns.despine(ax=ax, top=True, right=True)
+    figure.tight_layout()
+
+    fig_path = f"{out_dir}/performance_group_by_rdoc_stacked_{simul_or_real}.png"
+    savefig_pub(fig_path)
+    plt.close(figure)
+    return fig_path
+
+
+def plot_rdoc_performance_group_heatmap(proportion_table, out_dir, simul_or_real):
+    annot_table = proportion_table.mul(100.0).applymap(lambda value: f"{value:.1f}%")
+
+    figure, ax = plt.subplots(figsize=(7.4, max(4.8, 0.7 * len(proportion_table.index))))
+    heatmap = sns.heatmap(
+        proportion_table.loc[:, PERFORMANCE_GROUP_LABELS],
+        cmap="YlGnBu",
+        vmin=0.0,
+        vmax=1.0,
+        annot=annot_table.loc[:, PERFORMANCE_GROUP_LABELS],
+        fmt="",
+        linewidths=0.7,
+        linecolor="white",
+        cbar_kws={"shrink": 0.8, "pad": 0.02},
+        ax=ax,
+    )
+    colorbar = heatmap.collections[0].colorbar
+    colorbar.set_label("Proportion", fontweight="bold")
+
+    ax.set_xlabel("Performance group")
+    ax.set_ylabel("RDoC domain")
+    plt.setp(ax.get_xticklabels(), rotation=0)
+    plt.setp(ax.get_yticklabels(), rotation=0)
+    figure.tight_layout()
+
+    fig_path = f"{out_dir}/performance_group_by_rdoc_heatmap_{simul_or_real}.png"
+    savefig_pub(fig_path)
+    plt.close(figure)
+    return fig_path
+
+
 def plot_rdoc_overall_distribution(df, out_dir, simul_or_real):
     rdoc_order = _get_present_rdoc_order(df, simul_or_real)
     assert rdoc_order, "No RDoC values found for plotting"
@@ -775,6 +900,19 @@ def main():
     rdoc_faceted_paths = plot_rdoc_faceted_distribution(
         df, paths["out_dir"], args.simul_or_real
     )
+    rdoc_group_long_df, rdoc_group_count_table, rdoc_group_prop_table = (
+        build_rdoc_performance_group_table(df, args.simul_or_real)
+    )
+    rdoc_group_csv_path = (
+        f"{paths['out_dir']}/performance_group_by_rdoc_{args.simul_or_real}.csv"
+    )
+    rdoc_group_long_df.to_csv(rdoc_group_csv_path, index=False)
+    rdoc_group_bar_path = plot_rdoc_performance_group_stacked_bar(
+        rdoc_group_prop_table, paths["out_dir"], args.simul_or_real
+    )
+    rdoc_group_heatmap_path = plot_rdoc_performance_group_heatmap(
+        rdoc_group_prop_table, paths["out_dir"], args.simul_or_real
+    )
 
     print(f"Saved dataframe with shape: {df.shape}")
     print(f"CSV: {csv_path}")
@@ -785,6 +923,9 @@ def main():
     print(f"Top-bottom profile figure: {profile_fig_path}")
     print(f"RDoC overall figure: {rdoc_overall_path}")
     print(f"RDoC per-combination figures: {len(rdoc_faceted_paths)} files")
+    print(f"RDoC performance-group CSV: {rdoc_group_csv_path}")
+    print(f"RDoC performance-group stacked bar: {rdoc_group_bar_path}")
+    print(f"RDoC performance-group heatmap: {rdoc_group_heatmap_path}")
 
 
 if __name__ == "__main__":

From f2d784bede252043ee11a12d690509e1b3a76136 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 14 Mar 2026 15:02:24 -0400
Subject: [PATCH 374/401] adjust RDoC figure sizes

---
 .../performance_factor.py                     | 51 ++++++++++++++-----
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index 29797c1..b1c73d5 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -785,6 +785,17 @@ def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real):
     )
     assert not combo_df.empty, "No classifier/embedding combinations found for plotting"
 
+    n_methods = df["dFC assessment method"].nunique()
+    # Generous per-domain width so boxes never feel cramped
+    n_domains = len(rdoc_order)
+    # Each domain gets ~2.8 in; minimum figure width 18 in
+    axes_width = max(18.0, 2.8 * n_domains)
+    # Reserve ~3.5 in on the right for the legend column
+    legend_width = 3.5
+    total_width = axes_width + legend_width
+    # Height: 8 in gives comfortable y-axis room; scale slightly with methods
+    height = max(8.0, 0.35 * n_methods + 6.5)
+
     fig_paths = []
     for _, combo in combo_df.iterrows():
         classifier = combo["classifier model"]
@@ -796,9 +807,7 @@ def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real):
         if sub_df.empty:
             continue
 
-        width = max(9.5, 1.45 * len(rdoc_order))
-        height = 6.8
-        figure, ax = plt.subplots(figsize=(width, height))
+        figure, ax = plt.subplots(figsize=(total_width, height))
 
         sns.boxplot(
             data=sub_df,
@@ -807,14 +816,21 @@ def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real):
             hue="dFC assessment method",
             order=rdoc_order,
             showfliers=False,
-            width=0.68,
+            width=0.72,
+            linewidth=1.4,
             ax=ax,
         )
 
         ax.set_ylim(0.45, 1.02)
-        ax.set_xlabel("RDoC domain")
-        ax.set_ylabel("Balanced accuracy")
-        ax.set_title(f"{classifier} | {embedding}", fontweight="bold", pad=10)
+        ax.set_xlabel("RDoC domain", labelpad=12, fontsize=14)
+        ax.set_ylabel("Balanced accuracy", labelpad=12, fontsize=14)
+        ax.set_title(
+            f"{classifier}  |  {embedding}",
+            fontweight="bold",
+            pad=14,
+            fontsize=15,
+        )
+        ax.tick_params(axis="both", labelsize=12)
         ax.yaxis.set_major_locator(MultipleLocator(0.05))
         ax.yaxis.set_minor_locator(MultipleLocator(0.025))
         ax.grid(True, axis="y", which="major", linestyle="-", alpha=0.36)
@@ -822,26 +838,33 @@ def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real):
         for label in ax.get_xticklabels():
             label.set_rotation(30)
             label.set_horizontalalignment("right")
+            label.set_fontsize(13)
 
         handles, labels = ax.get_legend_handles_labels()
         if handles:
-            ax.legend(
+            ax.get_legend().remove()
+            figure.legend(
                 handles,
                 labels,
                 title="dFC assessment method",
+                title_fontsize=12,
+                fontsize=11,
                 frameon=True,
-                loc="upper left",
-                bbox_to_anchor=(1.01, 1.0),
-                borderaxespad=0.0,
+                loc="center left",
+                bbox_to_anchor=(axes_width / total_width + 0.01, 0.5),
             )
 
         sns.despine(ax=ax, top=True, right=True)
-        figure.tight_layout()
+        # Leave right margin for the figure-level legend
+        figure.tight_layout(rect=[0, 0, axes_width / total_width, 1])
 
         classifier_key = str(classifier).replace(" ", "_").replace("/", "-")
         embedding_key = str(embedding).replace(" ", "_").replace("/", "-")
-        fig_path = f"{out_dir}/performance_by_rdoc_{classifier_key}_{embedding_key}_{simul_or_real}.png"
-        savefig_pub(fig_path)
+        fig_path = (
+            f"{out_dir}/performance_by_rdoc_{classifier_key}"
+            f"_{embedding_key}_{simul_or_real}.png"
+        )
+        plt.savefig(fig_path, bbox_inches="tight", dpi=150)
         plt.close(figure)
         fig_paths.append(fig_path)
 

From b92f9261b1f86be83f8036cd1437861bc4cb4bc1 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 18 Mar 2026 18:51:07 -0400
Subject: [PATCH 375/401] change EXP ids

---
 .../helper_functions.py                       | 63 +++++++++----------
 1 file changed, 31 insertions(+), 32 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index c5e3f94..b4a9ff1 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -157,50 +157,49 @@ def savefig_pub(path_png_or_pdf: str):
 
 DEFAULT_EXPERIMENT_NAME_MAP = {
     "real": {
-        "emotionregulation": "EXP.18",
+        "emotionregulation": "EXP.17",
         "audsem": "EXP.3",
         "visrhyme": "EXP.4",
         "vissem": "EXP.5",
         "visspell": "EXP.6",
-        "arithmetic": "EXP.24",
-        "stroop": "EXP.15",
-        "cuedts": "EXP.13",
-        "axcpt": "EXP.12",
-        "matching": "EXP.25",
-        "stern": "EXP.14",
-        "st": "EXP.29",
-        "vswm": "EXP.26",
-        "expo": "EXP.20",
-        "recall": "EXP.21",
-        "feedback": "EXP.22",
+        "arithmetic": "EXP.23",
+        "stroop": "EXP.14",
+        "cuedts": "EXP.12",
+        "axcpt": "EXP.11",
+        "matching": "EXP.24",
+        "stern": "EXP.13",
+        "st": "EXP.28",
+        "vswm": "EXP.25",
+        "expo": "EXP.19",
+        "recall": "EXP.20",
+        "feedback": "EXP.21",
         "ppalocalizer": "EXP.2",
-        "localiser": "EXP.27",
-        "localizer": "EXP.28",
-        "cic": "EXP.11",
+        "localiser": "EXP.26",
+        "localizer": "EXP.27",
         "fribbids": "EXP.10",
         "risk": "EXP.9",
         "itc": "EXP.8",
         "fearlearning": "EXP.1",
-        "paingen": "EXP.23",
-        "motor": "EXP.19",
-        "execution": "EXP.16",
-        "imagery": "EXP.17",
+        "paingen": "EXP.22",
+        "motor": "EXP.18",
+        "execution": "EXP.15",
+        "imagery": "EXP.16",
         "ihg": "EXP.7",
     },
     "simulated": {
-        "lowfreqlongrest": "EXP.1",
-        "lowfreqshortrest": "EXP.2",
-        "lowfreqshorttask": "EXP.3",
-        "axcpt": "EXP.4",
-        "stern": "EXP.5",
-        "cuedts": "EXP.6",
-        "execution": "EXP.7",
-        "imagery": "EXP.8",
-        "localizer": "EXP.9",
-        "ppalocalizer": "EXP.10",
-        "itc": "EXP.11",
-        "stroop": "EXP.12",
-        "risk": "EXP.13",
+        "lowfreqlongrest": "EXP.S.29",
+        "lowfreqshortrest": "EXP.S.30",
+        "lowfreqshorttask": "EXP.S.31",
+        "axcpt": "EXP.S.11",
+        "stern": "EXP.S.13",
+        "cuedts": "EXP.S.12",
+        "execution": "EXP.S.15",
+        "imagery": "EXP.S.16",
+        "localizer": "EXP.S.27",
+        "ppalocalizer": "EXP.S.2",
+        "itc": "EXP.S.8",
+        "stroop": "EXP.S.14",
+        "risk": "EXP.S.9",
     },
 }
 

From 7c4f04c529b7ce921500f44e86bc81e61194bf42 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 18 Mar 2026 19:52:24 -0400
Subject: [PATCH 376/401] minor

---
 task_dFC/multi_dataset_analysis/helper_functions.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/helper_functions.py b/task_dFC/multi_dataset_analysis/helper_functions.py
index b4a9ff1..8f06217 100644
--- a/task_dFC/multi_dataset_analysis/helper_functions.py
+++ b/task_dFC/multi_dataset_analysis/helper_functions.py
@@ -139,6 +139,7 @@ def savefig_pub(path_png_or_pdf: str):
             "axcpt": "Strong Performance on Real Data",
             "stern": "Strong Performance on Real Data",
             "cuedts": "Strong Performance on Real Data",
+            "stroop": "Strong Performance on Real Data",
             # Optimal Paradigm Design, Weak Performance on Real Data
             "execution": "Weak Performance on Real Data",
             "imagery": "Weak Performance on Real Data",
@@ -146,7 +147,6 @@ def savefig_pub(path_png_or_pdf: str):
             "ppalocalizer": "Weak Performance on Real Data",
             # Sub-Optimal Paradigm Design, Weak Performance on Real Data
             "itc": "Weak Performance on Real Data",
-            "stroop": "Weak Performance on Real Data",
             "risk": "Weak Performance on Real Data",
         },
     },
@@ -191,14 +191,14 @@ def savefig_pub(path_png_or_pdf: str):
         "lowfreqshortrest": "EXP.S.30",
         "lowfreqshorttask": "EXP.S.31",
         "axcpt": "EXP.S.11",
-        "stern": "EXP.S.13",
         "cuedts": "EXP.S.12",
+        "stern": "EXP.S.13",
+        "stroop": "EXP.S.14",
         "execution": "EXP.S.15",
         "imagery": "EXP.S.16",
         "localizer": "EXP.S.27",
         "ppalocalizer": "EXP.S.2",
         "itc": "EXP.S.8",
-        "stroop": "EXP.S.14",
         "risk": "EXP.S.9",
     },
 }

From e19ef8c2b41206dea0ddc370a27e8c84590579b4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 18 Mar 2026 21:08:54 -0400
Subject: [PATCH 377/401] improvements in ml_results

---
 task_dFC/multi_dataset_analysis/ml_results.py | 36 ++++++++++++++-----
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/ml_results.py b/task_dFC/multi_dataset_analysis/ml_results.py
index 2312743..f522886 100644
--- a/task_dFC/multi_dataset_analysis/ml_results.py
+++ b/task_dFC/multi_dataset_analysis/ml_results.py
@@ -8,6 +8,7 @@
 import pandas as pd
 import seaborn as sns
 from matplotlib.colors import to_rgba
+from matplotlib.ticker import PercentFormatter
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from helper_functions import (  # pyright: ignore[reportMissingImports]
@@ -40,6 +41,7 @@
 TOP_EXPERIMENT_MARKERS = ["*"]  # star for all top experiments
 COLOR_THRESHOLD = 60.0
 PER_METHOD_LABEL_SCORE_THRESHOLD = 55.0
+SIMULATED_METHOD_MEDIAN_ANNOTATION_THRESHOLD = 80.0
 NEUTRAL_COLOR = "#D49B9B"
 
 
@@ -386,14 +388,26 @@ def annotate_per_method_quartile(
     point_coordinates,
     method_order,
     colored_experiments,
+    metric,
+    simul_or_real,
     score_threshold=PER_METHOD_LABEL_SCORE_THRESHOLD,
 ):
     """
-    For each method, annotate points in the top quartile (>75th percentile)
-    if score > threshold. Position annotation left/right based on point position.
+    Default behavior:
+    - annotate colored experiments in top quartile and above score_threshold.
+
+    Simulated + non-SI override:
+    - if a method median is above SIMULATED_METHOD_MEDIAN_ANNOTATION_THRESHOLD,
+      annotate all experiments for that method.
     """
-    if not colored_experiments:
+    simulated_non_si = simul_or_real == "simulated" and metric != "SI"
+    if not colored_experiments and not simulated_non_si:
         return
+
+    simulated_median_threshold = convert_threshold_to_score_scale(
+        SIMULATED_METHOD_MEDIAN_ANNOTATION_THRESHOLD, metric
+    )
+
     xticks = ax.get_xticks()
     xticklabels = [t.get_text() for t in ax.get_xticklabels()]
     method_positions = {lab: xticks[i] for i, lab in enumerate(xticklabels)}
@@ -406,11 +420,14 @@ def annotate_per_method_quartile(
         scores = method_df["score"].values
         quartile_threshold = np.percentile(scores, 75)
 
-        qualify_rows = method_df[
-            method_df["experiment"].isin(colored_experiments)
-            & (method_df["score"] > score_threshold)
-            & (method_df["score"] >= quartile_threshold)
-        ]
+        if simulated_non_si and np.nanmedian(scores) > simulated_median_threshold:
+            qualify_rows = method_df
+        else:
+            qualify_rows = method_df[
+                method_df["experiment"].isin(colored_experiments)
+                & (method_df["score"] > score_threshold)
+                & (method_df["score"] >= quartile_threshold)
+            ]
 
         method_center = method_positions[method]
 
@@ -546,6 +563,8 @@ def plot_best_pointplot(
         point_coordinates,
         method_order,
         colored_experiments=colored_experiments,
+        metric=metric,
+        simul_or_real=simul_or_real,
         score_threshold=label_threshold,
     )
 
@@ -555,6 +574,7 @@ def plot_best_pointplot(
         ax.set_ylim(top=1.02)
     else:
         ax.set_ylim(0.48, 1.02)
+        ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0, decimals=0))
     ax.grid(True, axis="y", color="#FFFFFF", alpha=0.85, linewidth=1.1)
     sns.despine(ax=ax, top=True, right=True)
     plt.setp(ax.get_xticklabels(), rotation=35, ha="right")

From 1685cab5a148cac7daeccc5dbba684c4f201b980 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 22 Mar 2026 12:54:14 -0400
Subject: [PATCH 378/401] add optimality index

---
 .../performance_factor.py                     |  1 +
 .../task_timing_stats.py                      | 69 +++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index b1c73d5..492fb93 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -28,6 +28,7 @@
 TIMING_FEATURES = [
     "task_ratio_avg",
     "transition_freq_avg",
+    "OI_avg",
     "rest_durations_median",
     "task_durations_median",
     "rest_durations_iqr",
diff --git a/task_dFC/multi_dataset_analysis/task_timing_stats.py b/task_dFC/multi_dataset_analysis/task_timing_stats.py
index 4082176..06032a2 100644
--- a/task_dFC/multi_dataset_analysis/task_timing_stats.py
+++ b/task_dFC/multi_dataset_analysis/task_timing_stats.py
@@ -15,6 +15,7 @@
     calc_rest_duration,
     calc_task_duration,
     calc_transition_freq,
+    compute_optimality_index,
     extract_task_presence,
 )
 
@@ -76,6 +77,7 @@
     transition_freq_all = {}
     rest_durations_all = {}
     task_durations_all = {}
+    OI_all = {}
     DATA = {
         "task": [],
         "run": [],
@@ -86,6 +88,7 @@
         "task_durations_median": [],
         "rest_durations_iqr": [],
         "task_durations_iqr": [],
+        "OI_avg": [],
     }
     for dataset in DATASETS:
 
@@ -124,6 +127,7 @@
                     transition_freq_run = []
                     rest_durations_run = []
                     task_durations_run = []
+                    OI_run = []
 
                     SUBJECTS = find_subj_list(roi_root)
                     # print(f"Number of subjects: {len(SUBJECTS)}")
@@ -162,11 +166,19 @@
                         task_durations = calc_task_duration(
                             event_labels, TR_mri=1 / task_data["Fs_task"]
                         )
+                        # calculate Optimality Index
+                        out = compute_optimality_index(
+                            event_labels=event_labels,
+                            TR_task=1 / task_data["Fs_task"],
+                            TR_mri=task_data["TR_mri"],
+                        )
+                        OI = out["OI_norm"]
 
                         task_ratio_run.append(relative_task_on)
                         transition_freq_run.append(relative_transition_freq)
                         rest_durations_run.extend(rest_durations)
                         task_durations_run.extend(task_durations)
+                        OI_run.append(OI)
 
                     # Aggregate stats across runs for this task and store in the all-run dictionaries for later plotting
                     if not task in task_ratio_all:
@@ -177,10 +189,13 @@
                         rest_durations_all[task] = []
                     if not task in task_durations_all:
                         task_durations_all[task] = []
+                    if not task in OI_all:
+                        OI_all[task] = []
                     task_ratio_all[task].extend(task_ratio_run)
                     transition_freq_all[task].extend(transition_freq_run)
                     rest_durations_all[task].extend(rest_durations_run)
                     task_durations_all[task].extend(task_durations_run)
+                    OI_all[task].extend(OI_run)
 
                     # Aggregate run-level stats for this task and store in DATA for potential further analysis
                     DATA["task"].append(task)
@@ -196,6 +211,7 @@
                     iqr_task = q75_task - q25_task
                     DATA["rest_durations_iqr"].append(iqr_rest)
                     DATA["task_durations_iqr"].append(iqr_task)
+                    DATA["OI_avg"].append(np.nanmean(OI_run))
 
     np.save(f"{output_root}/task_timing_stats_{simul_or_real}.npy", DATA)
 
@@ -416,4 +432,57 @@
         pad_inches=fig_pad,
     )
     plt.close()
+
+    # ======================================
+    # 4) Optimality Index (sorted by median) — BOX PLOT + median labels
+    # ======================================
+    order_oi, stats_oi = order_by_median_dict(OI_all, reverse=True)
+    df_oi = as_long_df(OI_all, "OI_avg")
+    df_oi = df_oi[df_oi["task"].isin(order_oi)]
+    df_oi["task"] = pd.Categorical(df_oi["task"], categories=order_oi, ordered=True)
+
+    fig_w = max(15, 15 / 30 * len(order_oi))
+    plt.figure(figsize=(fig_w, 6))
+
+    ax = sns.boxplot(
+        data=df_oi,
+        x="task",
+        y="OI_avg",
+        order=order_oi,
+        width=0.6,
+        linewidth=1,
+        showfliers=False,
+    )
+
+    ax.set_xlabel("Task paradigm")
+    ax.set_ylabel("Optimality Index")
+
+    # annotate medians
+    annotate_medians_single_boxplot(
+        ax,
+        df_oi,
+        x_col="task",
+        y_col="OI_avg",
+        order=order_oi,
+        fmt="{:.2f}",
+        box_alpha=0.6,
+    )
+
+    for label in ax.get_xticklabels():
+        label.set_rotation(65)
+        label.set_horizontalalignment("right")
+        label.set_fontweight("bold")
+    if show_title:
+        ax.set_title(
+            "Optimality Index per task (box + samples, ordered by median)", pad=12
+        )
+
+    plt.tight_layout()
+    plt.savefig(
+        f"{output_root}/optimality_index_{simul_or_real}.{save_fig_format}",
+        bbox_inches=fig_bbox_inches,
+        pad_inches=fig_pad,
+    )
+    plt.close()
+
     # =========================================================

From fc7dd8ac4297ebef1df82ccdc7f5bbc5baa6823c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 22 Mar 2026 13:04:31 -0400
Subject: [PATCH 379/401] change task paradigm to experiment name in
 visualizations

---
 task_dFC/multi_dataset_analysis/cohensd.py    | 37 +++++++--
 .../dfc_visualization.py                      | 13 +++-
 .../task_presence_binarization.py             | 19 ++++-
 .../task_timing_stats.py                      | 78 +++++++++++++------
 4 files changed, 113 insertions(+), 34 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/cohensd.py b/task_dFC/multi_dataset_analysis/cohensd.py
index 526054f..f3801e2 100644
--- a/task_dFC/multi_dataset_analysis/cohensd.py
+++ b/task_dFC/multi_dataset_analysis/cohensd.py
@@ -1,6 +1,7 @@
 import argparse
 import json
 import os
+import sys
 
 import matplotlib.pyplot as plt
 import nibabel as nib
@@ -13,6 +14,11 @@
 from pydfc.ml_utils import find_available_subjects, load_task_data
 from pydfc.task_utils import cohen_d_bold, extract_task_presence
 
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from helper_functions import (  # pyright: ignore[reportMissingImports]
+    build_experiment_display_info,
+)
+
 #######################################################################################
 
 if __name__ == "__main__":
@@ -315,6 +321,13 @@
     # Build dataframe if not already done
     DF = pd.DataFrame.from_dict(CohensD_across_task)
 
+    task_order_reference, task_to_experiment, _, _ = build_experiment_display_info(
+        tasks_iterable=DF["task"].unique().tolist(),
+        task_reference_order=TASKS_to_include,
+        simul_or_real=simul_or_real,
+    )
+    DF["experiment"] = DF["task"].map(task_to_experiment)
+
     # Use absolute Cohen's d
     DF["abs_d"] = DF["d_values"].abs()
 
@@ -326,6 +339,8 @@
         .reset_index(name="abs_max")
     )
     task_order = max_abs_per_task["task"].tolist()
+    experiment_order = [task_to_experiment[task] for task in task_order]
+    max_abs_per_task["experiment"] = max_abs_per_task["task"].map(task_to_experiment)
 
     # Dynamic width so labels don't collide (0.6 inch per task, min 14 inches)
     fig_width = max(14, 0.6 * len(task_order))
@@ -335,15 +350,20 @@
 
     # Boxplot (hide outliers to avoid double-plotting with the samples)
     ax = sns.boxplot(
-        data=DF, x="task", y="abs_d", order=task_order, showfliers=False, width=0.6
+        data=DF,
+        x="experiment",
+        y="abs_d",
+        order=experiment_order,
+        showfliers=False,
+        width=0.6,
     )
 
     # Overlay individual samples (one point per ROI sample)
     sns.stripplot(
         data=DF,
-        x="task",
+        x="experiment",
         y="abs_d",
-        order=task_order,
+        order=experiment_order,
         dodge=False,
         jitter=0.25,
         size=2,
@@ -351,7 +371,7 @@
         ax=ax,
     )
 
-    ax.set_xlabel("Task")
+    ax.set_xlabel("Experiment")
     ax.set_ylabel("|Cohen's d|")
     ax.set_ylim(bottom=0)
     ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
@@ -369,7 +389,12 @@
     # -------- Figure 2: Max |Cohen's d| across ROIs per task --------
     plt.figure(figsize=(fig_width, 6))
 
-    ax = sns.barplot(data=max_abs_per_task, x="task", y="abs_max", order=task_order)
+    ax = sns.barplot(
+        data=max_abs_per_task,
+        x="experiment",
+        y="abs_max",
+        order=experiment_order,
+    )
 
     # Optional: annotate bars with values (trim to 2 decimals)
     for p in ax.patches:
@@ -384,7 +409,7 @@
             fontsize=8,
         )
 
-    ax.set_xlabel("Task")
+    ax.set_xlabel("Experiment")
     ax.set_ylabel("Max |Cohen's d|")
     ax.set_ylim(bottom=0)
     ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
diff --git a/task_dFC/multi_dataset_analysis/dfc_visualization.py b/task_dFC/multi_dataset_analysis/dfc_visualization.py
index b4395ef..52199d8 100644
--- a/task_dFC/multi_dataset_analysis/dfc_visualization.py
+++ b/task_dFC/multi_dataset_analysis/dfc_visualization.py
@@ -8,6 +8,7 @@
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 from helper_functions import (  # pyright: ignore[reportMissingImports]
+    build_experiment_display_info,
     figure_dfc_matrices_window_png,
 )
 
@@ -54,6 +55,12 @@
     if not os.path.exists(output_root):
         os.makedirs(output_root)
 
+    _, task_to_experiment, _, _ = build_experiment_display_info(
+        tasks_iterable=TASKS_to_include,
+        task_reference_order=TASKS_to_include,
+        simul_or_real=simul_or_real,
+    )
+
     for dataset in DATASETS:
         dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
         roi_root = f"{main_root}/{dataset}/derivatives/ROI_timeseries"
@@ -135,7 +142,11 @@
                 common_TRs,
                 window_len=10,
                 cmap="plasma",
-                outfile=f"{output_root}/dFC_{dataset}_{task}_mid_10.png",
+                outfile=(
+                    f"{output_root}/dFC_{dataset}_"
+                    f"{task_to_experiment.get(task, task).replace(' ', '_').replace('/', '-')}_"
+                    f"{task}_mid_10.png"
+                ),
                 dpi=600,
             )
 
diff --git a/task_dFC/multi_dataset_analysis/task_presence_binarization.py b/task_dFC/multi_dataset_analysis/task_presence_binarization.py
index a736a74..dac9547 100644
--- a/task_dFC/multi_dataset_analysis/task_presence_binarization.py
+++ b/task_dFC/multi_dataset_analysis/task_presence_binarization.py
@@ -1,6 +1,7 @@
 import argparse
 import json
 import os
+import sys
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -8,6 +9,11 @@
 from pydfc.ml_utils import find_available_subjects, load_task_data
 from pydfc.task_utils import extract_task_presence
 
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from helper_functions import (  # pyright: ignore[reportMissingImports]
+    build_experiment_display_info,
+)
+
 #######################################################################################
 
 if __name__ == "__main__":
@@ -49,6 +55,12 @@
     if not os.path.exists(output_root):
         os.makedirs(output_root)
 
+    _, task_to_experiment, _, _ = build_experiment_display_info(
+        tasks_iterable=TASKS_to_include,
+        task_reference_order=TASKS_to_include,
+        simul_or_real=simul_or_real,
+    )
+
     for dataset in DATASETS:
         print(f"Processing dataset: {dataset}")
         dataset_info_file = f"{main_root}/{dataset}/codes/dataset_info.json"
@@ -200,8 +212,11 @@
                 plt.gca().set_xticklabels(time_labels, fontsize=50)
                 plt.xlabel("Time (sec)", fontsize=60)
 
+                experiment_label = task_to_experiment.get(task, task)
+                experiment_key = str(experiment_label).replace(" ", "_").replace("/", "-")
+
                 plt.savefig(
-                    f"{output_root}/task_timing_{task}.png",
+                    f"{output_root}/task_timing_{experiment_key}_{task}.png",
                     dpi=120,
                     bbox_inches="tight",
                     pad_inches=0.1,
@@ -209,7 +224,7 @@
                 )
                 if task == "task-Localizer":
                     plt.savefig(
-                        f"{output_root}/task_timing_{task}.svg",
+                        f"{output_root}/task_timing_{experiment_key}_{task}.svg",
                         dpi=120,
                         bbox_inches="tight",
                         pad_inches=0.1,
diff --git a/task_dFC/multi_dataset_analysis/task_timing_stats.py b/task_dFC/multi_dataset_analysis/task_timing_stats.py
index 06032a2..daeb413 100644
--- a/task_dFC/multi_dataset_analysis/task_timing_stats.py
+++ b/task_dFC/multi_dataset_analysis/task_timing_stats.py
@@ -24,6 +24,7 @@
     annotate_medians_by_geometry,
     annotate_medians_single_boxplot,
     as_long_df,
+    build_experiment_display_info,
     order_by_median_dict,
     setup_pub_style,
 )
@@ -215,6 +216,19 @@
 
     np.save(f"{output_root}/task_timing_stats_{simul_or_real}.npy", DATA)
 
+    all_tasks_present = sorted(
+        set(task_ratio_all)
+        | set(transition_freq_all)
+        | set(rest_durations_all)
+        | set(task_durations_all)
+        | set(OI_all)
+    )
+    _, task_to_experiment, _, _ = build_experiment_display_info(
+        tasks_iterable=all_tasks_present,
+        task_reference_order=TASKS_to_include,
+        simul_or_real=simul_or_real,
+    )
+
     # =========================
     # Paper-quality seaborn plots (patched)
     # =========================
@@ -242,8 +256,10 @@
     order_ratio, stats_ratio = order_by_median_dict(task_ratio_all, reverse=True)
     df_ratio = as_long_df(task_ratio_all, "task_ratio")
     df_ratio = df_ratio[df_ratio["task"].isin(order_ratio)]
-    df_ratio["task"] = pd.Categorical(
-        df_ratio["task"], categories=order_ratio, ordered=True
+    order_ratio_exp = [task_to_experiment[task] for task in order_ratio]
+    df_ratio["experiment"] = df_ratio["task"].map(task_to_experiment)
+    df_ratio["experiment"] = pd.Categorical(
+        df_ratio["experiment"], categories=order_ratio_exp, ordered=True
     )
 
     fig_w = max(15, 15 / 30 * len(order_ratio))
@@ -251,15 +267,15 @@
 
     ax = sns.boxplot(
         data=df_ratio,
-        x="task",
+        x="experiment",
         y="task_ratio",
-        order=order_ratio,
+        order=order_ratio_exp,
         width=0.6,
         linewidth=1,
         showfliers=False,
     )
 
-    ax.set_xlabel("Task paradigm")
+    ax.set_xlabel("Experiment")
     ax.set_ylabel("Task ratio")
     ax.set_ylim(0, 1)  # keep ratios bounded
 
@@ -267,9 +283,9 @@
     annotate_medians_single_boxplot(
         ax,
         df_ratio,
-        x_col="task",
+        x_col="experiment",
         y_col="task_ratio",
-        order=order_ratio,
+        order=order_ratio_exp,
         fmt="{:.2f}",
         box_alpha=0.6,
     )
@@ -295,31 +311,35 @@
     order_tf, stats_tf = order_by_median_dict(transition_freq_all, reverse=True)
     df_tf = as_long_df(transition_freq_all, "transition_freq")
     df_tf = df_tf[df_tf["task"].isin(order_tf)]
-    df_tf["task"] = pd.Categorical(df_tf["task"], categories=order_tf, ordered=True)
+    order_tf_exp = [task_to_experiment[task] for task in order_tf]
+    df_tf["experiment"] = df_tf["task"].map(task_to_experiment)
+    df_tf["experiment"] = pd.Categorical(
+        df_tf["experiment"], categories=order_tf_exp, ordered=True
+    )
 
     fig_w = max(15, 15 / 30 * len(order_tf))
     plt.figure(figsize=(fig_w, 6))
 
     ax = sns.boxplot(
         data=df_tf,
-        x="task",
+        x="experiment",
         y="transition_freq",
-        order=order_tf,
+        order=order_tf_exp,
         width=0.6,
         linewidth=1,
         showfliers=False,
     )
 
-    ax.set_xlabel("Task paradigm")
+    ax.set_xlabel("Experiment")
     ax.set_ylabel("Relative transition frequency")
 
     # annotate medians
     annotate_medians_single_boxplot(
         ax,
         df_tf,
-        x_col="task",
+        x_col="experiment",
         y_col="transition_freq",
-        order=order_tf,
+        order=order_tf_exp,
         fmt="{:.2f}",
         box_alpha=0.6,
     )
@@ -353,7 +373,11 @@
     # Order tasks by mean Task duration (change to Rest if you prefer)
     order_dur, _ = order_by_median_dict(task_durations_all, reverse=True)
     df_dur = df_dur[df_dur["task"].isin(order_dur)]
-    df_dur["task"] = pd.Categorical(df_dur["task"], categories=order_dur, ordered=True)
+    order_dur_exp = [task_to_experiment[task] for task in order_dur]
+    df_dur["experiment"] = df_dur["task"].map(task_to_experiment)
+    df_dur["experiment"] = pd.Categorical(
+        df_dur["experiment"], categories=order_dur_exp, ordered=True
+    )
 
     # ---- LOG display handling (avoid -inf for zeros) ----
     # pick an adaptive epsilon based on the smallest positive value
@@ -370,10 +394,10 @@
     # Boxplot on log scale (no fliers; jitters will show samples, incl. singletons)
     ax = sns.boxplot(
         data=df_dur,
-        x="task",
+        x="experiment",
         y="duration_plot",
         hue="state",
-        order=order_dur,
+        order=order_dur_exp,
         hue_order=["Rest", "Task"],
         linewidth=1,
         dodge=True,
@@ -388,10 +412,10 @@
     annotate_medians_by_geometry(
         ax=ax,
         df_long=df_dur,  # the DF you plotted
-        x_col="task",
+        x_col="experiment",
         hue_col="state",
         y_col="duration_plot",  # the epsilon-clipped column you used for plotting
-        x_order=order_dur,
+        x_order=order_dur_exp,
         hue_order=["Rest", "Task"],
         fmt="{:.0f}",
         y_nudge_factor=1.08,  # bump if labels sit on the line in log-space
@@ -414,7 +438,7 @@
     )
     ax.legend(handles_clean, labels_clean, title="", frameon=True, loc="upper right")
 
-    ax.set_xlabel("Task paradigm")
+    ax.set_xlabel("Experiment")
     ax.set_ylabel("Duration (sec, log scale)")
 
     for label in ax.get_xticklabels():
@@ -439,31 +463,35 @@
     order_oi, stats_oi = order_by_median_dict(OI_all, reverse=True)
     df_oi = as_long_df(OI_all, "OI_avg")
     df_oi = df_oi[df_oi["task"].isin(order_oi)]
-    df_oi["task"] = pd.Categorical(df_oi["task"], categories=order_oi, ordered=True)
+    order_oi_exp = [task_to_experiment[task] for task in order_oi]
+    df_oi["experiment"] = df_oi["task"].map(task_to_experiment)
+    df_oi["experiment"] = pd.Categorical(
+        df_oi["experiment"], categories=order_oi_exp, ordered=True
+    )
 
     fig_w = max(15, 15 / 30 * len(order_oi))
     plt.figure(figsize=(fig_w, 6))
 
     ax = sns.boxplot(
         data=df_oi,
-        x="task",
+        x="experiment",
         y="OI_avg",
-        order=order_oi,
+        order=order_oi_exp,
         width=0.6,
         linewidth=1,
         showfliers=False,
     )
 
-    ax.set_xlabel("Task paradigm")
+    ax.set_xlabel("Experiment")
     ax.set_ylabel("Optimality Index")
 
     # annotate medians
     annotate_medians_single_boxplot(
         ax,
         df_oi,
-        x_col="task",
+        x_col="experiment",
         y_col="OI_avg",
-        order=order_oi,
+        order=order_oi_exp,
         fmt="{:.2f}",
         box_alpha=0.6,
     )

From c232ae497ac16bbcb1806cefe0acd15e4dfafe63 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 22 Mar 2026 14:00:47 -0400
Subject: [PATCH 380/401] minor in OI

---
 .../performance_factor.py                      |  2 +-
 .../task_timing_stats.py                       | 18 +++++++++++++++---
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index 492fb93..9d30c21 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -28,7 +28,7 @@
 TIMING_FEATURES = [
     "task_ratio_avg",
     "transition_freq_avg",
-    "OI_avg",
+    "OI_median",
     "rest_durations_median",
     "task_durations_median",
     "rest_durations_iqr",
diff --git a/task_dFC/multi_dataset_analysis/task_timing_stats.py b/task_dFC/multi_dataset_analysis/task_timing_stats.py
index daeb413..1cabc00 100644
--- a/task_dFC/multi_dataset_analysis/task_timing_stats.py
+++ b/task_dFC/multi_dataset_analysis/task_timing_stats.py
@@ -89,7 +89,7 @@
         "task_durations_median": [],
         "rest_durations_iqr": [],
         "task_durations_iqr": [],
-        "OI_avg": [],
+        "OI_median": [],
     }
     for dataset in DATASETS:
 
@@ -212,7 +212,7 @@
                     iqr_task = q75_task - q25_task
                     DATA["rest_durations_iqr"].append(iqr_rest)
                     DATA["task_durations_iqr"].append(iqr_task)
-                    DATA["OI_avg"].append(np.nanmean(OI_run))
+                    DATA["OI_median"].append(np.nanmedian(OI_run))
 
     np.save(f"{output_root}/task_timing_stats_{simul_or_real}.npy", DATA)
 
@@ -479,11 +479,23 @@
         order=order_oi_exp,
         width=0.6,
         linewidth=1,
-        showfliers=False,
+        showfliers=True,
+        flierprops={
+            "marker": "o",
+            "markersize": 3.5,
+            "alpha": 0.65,
+            "markerfacecolor": "#444444",
+            "markeredgecolor": "#444444",
+        },
     )
 
     ax.set_xlabel("Experiment")
     ax.set_ylabel("Optimality Index")
+    oi_max = float(np.nanmax(df_oi["OI_avg"]))
+    oi_min = float(np.nanmin(df_oi["OI_avg"]))
+    if np.isfinite(oi_max) and np.isfinite(oi_min):
+        y_pad = max(0.03 * (oi_max - oi_min), 0.02)
+        ax.set_ylim(oi_min - y_pad, oi_max + y_pad)
 
     # annotate medians
     annotate_medians_single_boxplot(

From 992a4d5f9c371d1e560147c7c331357434392e26 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 22 Mar 2026 14:44:37 -0400
Subject: [PATCH 381/401] don't clip OI_norm

---
 pydfc/task_utils.py                                  | 2 +-
 task_dFC/multi_dataset_analysis/task_timing_stats.py | 9 +--------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index d1dfd79..9cfecb2 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -792,7 +792,7 @@ def compute_optimality_index(
         OI_norm = 0.0
     else:
         OI_norm = OI / OI_ideal
-        OI_norm = max(0.0, min(1.0, float(OI_norm)))  # clamp to [0,1]
+        # OI_norm = max(0.0, min(1.0, float(OI_norm)))  # clamp to [0,1]
 
     return {
         "OI": float(OI),
diff --git a/task_dFC/multi_dataset_analysis/task_timing_stats.py b/task_dFC/multi_dataset_analysis/task_timing_stats.py
index 1cabc00..b485b73 100644
--- a/task_dFC/multi_dataset_analysis/task_timing_stats.py
+++ b/task_dFC/multi_dataset_analysis/task_timing_stats.py
@@ -479,14 +479,7 @@
         order=order_oi_exp,
         width=0.6,
         linewidth=1,
-        showfliers=True,
-        flierprops={
-            "marker": "o",
-            "markersize": 3.5,
-            "alpha": 0.65,
-            "markerfacecolor": "#444444",
-            "markeredgecolor": "#444444",
-        },
+        showfliers=False,
     )
 
     ax.set_xlabel("Experiment")

From 0ab6b46ca9c73fea3510b97615e36ef3c755b58b Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 22 Mar 2026 19:45:41 -0400
Subject: [PATCH 382/401] correct OI

---
 pydfc/task_utils.py | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 9cfecb2..9d47dda 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -696,10 +696,11 @@ def compute_optimality_index(
     """
 
     # -------------------------
-    # 1. Preprocess Task Timing
+    # 1. Preprocess Task Timing (remove DC)
     # -------------------------
     task_tc = np.multiply(event_labels != 0, 1).astype(float).flatten()
     T = len(task_tc)
+    task_tc_centered = task_tc - np.mean(task_tc)
 
     # -------------------------
     # 2. HRF Model
@@ -729,7 +730,7 @@ def compute_optimality_index(
     # -------------------------
     # 4. FFT-based spectra
     # -------------------------
-    design_spectrum = np.abs(np.fft.rfft(task_tc)) ** 2
+    design_spectrum = np.abs(np.fft.rfft(task_tc_centered)) ** 2
     hrf_spectrum = np.abs(np.fft.rfft(hrf_tc)) ** 2
 
     # -------------------------
@@ -738,7 +739,11 @@ def compute_optimality_index(
     if fmax is None:
         fmax = 0.5 / TR_task
 
-    mask = (freqs >= fmin) & (freqs <= fmax)
+    # Exclude DC explicitly by requiring a strictly positive lower bound.
+    min_positive_freq = 1.0 / (T * TR_task)
+    effective_fmin = max(float(fmin), min_positive_freq)
+
+    mask = (freqs >= effective_fmin) & (freqs <= fmax)
     freqs_m = freqs[mask]
     design_spectrum_m = design_spectrum[mask]
     hrf_spectrum_m = hrf_spectrum[mask]
@@ -756,9 +761,7 @@ def compute_optimality_index(
     # 7. IDEAL OI (sinusoid at peak frequency)
     # -------------------------
 
-    # Remove DC by ignoring freq = 0
-    nonzero_mask = freqs_m > 0
-    if not np.any(nonzero_mask):
+    if freqs_m.size == 0:
         # no nonzero frequencies in the band
         return {
             "OI": float(OI),
@@ -767,19 +770,25 @@ def compute_optimality_index(
             "peak_freq": 0.0,
         }
 
-    freqs_nz = freqs_m[nonzero_mask]
-    design_spectrum_nz = design_spectrum_m[nonzero_mask]
-
-    # Find dominant non-DC frequency
-    peak_idx = np.argmax(design_spectrum_nz)
-    peak_freq = freqs_nz[peak_idx]
+    # Find dominant frequency in the positive-frequency analysis band.
+    peak_idx = np.argmax(design_spectrum_m)
+    peak_freq = freqs_m[peak_idx]
 
     # Build ideal sinusoid
     t = np.arange(T) * TR_task
     ideal_tc = np.sin(2 * np.pi * peak_freq * t)
+    ideal_tc_centered = ideal_tc - np.mean(ideal_tc)
+
+    # Match total time-domain power between ideal and task before FFT.
+    task_power = float(np.sum(task_tc_centered**2))
+    ideal_power = float(np.sum(ideal_tc_centered**2))
+    if ideal_power > eps and task_power > 0.0:
+        ideal_tc_centered = ideal_tc_centered * np.sqrt(task_power / ideal_power)
+    else:
+        ideal_tc_centered = np.zeros_like(ideal_tc_centered)
 
     # FFT of ideal design
-    ideal_spectrum = np.abs(np.fft.rfft(ideal_tc)) ** 2
+    ideal_spectrum = np.abs(np.fft.rfft(ideal_tc_centered)) ** 2
     ideal_spectrum_m = ideal_spectrum[mask]
 
     # Ideal OI

From 75aabcc928be5430568bf91a53d9b7817cdde75c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 22 Mar 2026 21:11:42 -0400
Subject: [PATCH 383/401] improve OI

---
 pydfc/task_utils.py | 34 +++++++++++-----------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 9d47dda..6fb2322 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -682,8 +682,10 @@ def compute_optimality_index(
     event_labels, TR_task, TR_mri, fmin=0.0, fmax=None, alpha=1.0
 ):
     """
-    Compute a Worsley-style optimality index (OI) and normalized OI
-    relative to an ideal sinusoidal design at the dominant frequency.
+    Compute a Worsley-style optimality index (OI) and normalized OI.
+
+    OI_norm is normalized by a theoretical in-band upper bound,
+    ensuring OI_norm <= 1 without hard clipping.
 
     Returns:
     --------
@@ -758,7 +760,7 @@ def compute_optimality_index(
     OI = np.sum(design_spectrum_m * snr_weight)
 
     # -------------------------
-    # 7. IDEAL OI (sinusoid at peak frequency)
+    # 7. IDEAL OI UPPER BOUND
     # -------------------------
 
     if freqs_m.size == 0:
@@ -770,29 +772,15 @@ def compute_optimality_index(
             "peak_freq": 0.0,
         }
 
-    # Find dominant frequency in the positive-frequency analysis band.
+    # Report dominant task frequency for interpretability.
     peak_idx = np.argmax(design_spectrum_m)
     peak_freq = freqs_m[peak_idx]
 
-    # Build ideal sinusoid
-    t = np.arange(T) * TR_task
-    ideal_tc = np.sin(2 * np.pi * peak_freq * t)
-    ideal_tc_centered = ideal_tc - np.mean(ideal_tc)
-
-    # Match total time-domain power between ideal and task before FFT.
-    task_power = float(np.sum(task_tc_centered**2))
-    ideal_power = float(np.sum(ideal_tc_centered**2))
-    if ideal_power > eps and task_power > 0.0:
-        ideal_tc_centered = ideal_tc_centered * np.sqrt(task_power / ideal_power)
-    else:
-        ideal_tc_centered = np.zeros_like(ideal_tc_centered)
-
-    # FFT of ideal design
-    ideal_spectrum = np.abs(np.fft.rfft(ideal_tc_centered)) ** 2
-    ideal_spectrum_m = ideal_spectrum[mask]
-
-    # Ideal OI
-    OI_ideal = np.sum(ideal_spectrum_m * snr_weight)
+    # Theoretical in-band upper bound for weighted spectral sum:
+    #   sum(d_i * w_i) <= max(w_i) * sum(d_i), with d_i >= 0.
+    design_band_power = float(np.sum(design_spectrum_m))
+    max_weight = float(np.max(snr_weight)) if snr_weight.size > 0 else 0.0
+    OI_ideal = design_band_power * max_weight
 
     # -------------------------
     # 8. Normalized OI

From 50bb99f9e1d6644ffabb7759d3e0823b8b2ab6f5 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 23 Mar 2026 17:15:01 -0400
Subject: [PATCH 384/401] correct OI

---
 pydfc/task_utils.py | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/pydfc/task_utils.py b/pydfc/task_utils.py
index 6fb2322..249a489 100644
--- a/pydfc/task_utils.py
+++ b/pydfc/task_utils.py
@@ -684,8 +684,9 @@ def compute_optimality_index(
     """
     Compute a Worsley-style optimality index (OI) and normalized OI.
 
-    OI_norm is normalized by a theoretical in-band upper bound,
-    ensuring OI_norm <= 1 without hard clipping.
+    Uses HRF spectrum as the weighting term (no explicit noise model).
+    OI_norm compares the observed design to an ideal sinusoid with
+    matched in-band power placed at the optimal frequency.
 
     Returns:
     --------
@@ -698,16 +699,16 @@ def compute_optimality_index(
     """
 
     # -------------------------
-    # 1. Preprocess Task Timing (remove DC)
+    # 1. Preprocess task timing
     # -------------------------
     task_tc = np.multiply(event_labels != 0, 1).astype(float).flatten()
     T = len(task_tc)
-    task_tc_centered = task_tc - np.mean(task_tc)
 
     # -------------------------
     # 2. HRF Model
     # -------------------------
-    time_length_HRF = 32.0
+    # same length as our task
+    time_length_HRF = T * TR_task
     oversampling = TR_mri / TR_task
 
     hrf_tc = glm.first_level.spm_hrf(
@@ -726,13 +727,10 @@ def compute_optimality_index(
     # -------------------------
     freqs = np.fft.rfftfreq(T, d=TR_task)
 
-    # simple 1/f^alpha noise
-    noise_psd = (freqs + 1e-6) ** (-alpha)
-
     # -------------------------
     # 4. FFT-based spectra
     # -------------------------
-    design_spectrum = np.abs(np.fft.rfft(task_tc_centered)) ** 2
+    design_spectrum = np.abs(np.fft.rfft(task_tc)) ** 2
     hrf_spectrum = np.abs(np.fft.rfft(hrf_tc)) ** 2
 
     # -------------------------
@@ -741,18 +739,13 @@ def compute_optimality_index(
     if fmax is None:
         fmax = 0.5 / TR_task
 
-    # Exclude DC explicitly by requiring a strictly positive lower bound.
-    min_positive_freq = 1.0 / (T * TR_task)
-    effective_fmin = max(float(fmin), min_positive_freq)
-
-    mask = (freqs >= effective_fmin) & (freqs <= fmax)
+    mask = (freqs >= float(fmin)) & (freqs <= fmax)
     freqs_m = freqs[mask]
     design_spectrum_m = design_spectrum[mask]
     hrf_spectrum_m = hrf_spectrum[mask]
-    noise_psd_m = noise_psd[mask]
 
     eps = 1e-12
-    snr_weight = hrf_spectrum_m / (noise_psd_m + eps)
+    snr_weight = hrf_spectrum_m
 
     # -------------------------
     # 6. ORIGINAL (TASK) OI
@@ -789,7 +782,6 @@ def compute_optimality_index(
         OI_norm = 0.0
     else:
         OI_norm = OI / OI_ideal
-        # OI_norm = max(0.0, min(1.0, float(OI_norm)))  # clamp to [0,1]
 
     return {
         "OI": float(OI),

From 973b0af6c630c6c1abe6c9c78ebeda0d545ec0b8 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 11 Apr 2026 19:54:22 -0400
Subject: [PATCH 385/401] add tsnr script

---
 task_dFC/multi_dataset_analysis/tsnr.py | 175 ++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 task_dFC/multi_dataset_analysis/tsnr.py

diff --git a/task_dFC/multi_dataset_analysis/tsnr.py b/task_dFC/multi_dataset_analysis/tsnr.py
new file mode 100644
index 0000000..213478d
--- /dev/null
+++ b/task_dFC/multi_dataset_analysis/tsnr.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+import argparse
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from helper_functions import (
+    annotate_medians_single_boxplot,
+    build_experiment_display_info,
+    order_by_median_dict,
+    setup_pub_style,
+)
+
+
+def _load_and_filter_tsnr_df(tsv_path: str) -> pd.DataFrame:
+    df = pd.read_csv(tsv_path, sep="\t", dtype=str)
+
+    # Make sure expected columns exist
+    required_cols = {"dataset", "sub", "ses", "task", "run", "tsnr_median", "error"}
+    missing = required_cols - set(df.columns)
+    if missing:
+        raise ValueError(f"Missing required columns in TSV: {sorted(missing)}")
+
+    # Normalize missing values
+    df = df.fillna("")
+
+    # Keep only rows without errors
+    df = df[df["error"].astype(str).str.strip() == ""].copy()
+
+    # Keep only the desired session for multi-session datasets
+    # ds005038 -> keep ses == "pre"
+    # ds003823 -> keep ses == "post"
+    mask_ds005038 = df["dataset"] == "ds005038"
+    mask_ds003823 = df["dataset"] == "ds003823"
+
+    df = df[
+        (~mask_ds005038 | (df["ses"] == "pre")) & (~mask_ds003823 | (df["ses"] == "post"))
+    ].copy()
+
+    # Convert tSNR median to numeric
+    df["tsnr_median"] = pd.to_numeric(df["tsnr_median"], errors="coerce")
+
+    # Drop rows where tsnr_median could not be parsed
+    df = df[df["tsnr_median"].notna()].copy()
+
+    return df
+
+
+def build_grouped_tsnr_summary(tsv_path: str) -> Path:
+    tsv_path = Path(tsv_path).resolve()
+    out_path = tsv_path.parent / "tsnr_summary_grouped.tsv"
+
+    df = _load_and_filter_tsnr_df(str(tsv_path))
+
+    # Average over subjects for each dataset/task/run
+    out_df = (
+        df.groupby(["dataset", "task", "run"], as_index=False)["tsnr_median"]
+        .mean()
+        .rename(columns={"tsnr_median": "median_tsnr_avg_over_subjects"})
+    )
+
+    # Append prefixes
+    out_df["task"] = "task-" + out_df["task"].astype(str)
+
+    def format_run(x):
+        if pd.isna(x) or str(x).strip() == "":
+            return None
+        return f"run-{x}"
+
+    out_df["run"] = out_df["run"].apply(format_run)
+
+    # Reorder columns exactly as requested
+    out_df = out_df[["dataset", "run", "task", "median_tsnr_avg_over_subjects"]]
+
+    # Optional: round nicely
+    out_df["median_tsnr_avg_over_subjects"] = out_df[
+        "median_tsnr_avg_over_subjects"
+    ].round(2)
+
+    # Save in same directory
+    out_df.to_csv(out_path, sep="\t", index=False)
+
+    return out_path
+
+
+def build_tsnr_distribution_figure(tsv_path: str) -> Path:
+    tsv_path = Path(tsv_path).resolve()
+    fig_path = tsv_path.parent / "tsnr_median_distribution_by_exp.png"
+
+    df = _load_and_filter_tsnr_df(str(tsv_path))
+    if df.empty:
+        raise ValueError("No valid tSNR rows available to plot after filtering.")
+
+    task_to_values = df.groupby("task")["tsnr_median"].apply(list).to_dict()
+    if not task_to_values:
+        raise ValueError("No task-wise tSNR values found for plotting.")
+
+    tasks_present = sorted(task_to_values.keys())
+    _, task_to_experiment, _, _ = build_experiment_display_info(
+        tasks_iterable=tasks_present,
+        task_reference_order=tasks_present,
+        simul_or_real="real",
+    )
+
+    order_task, _ = order_by_median_dict(task_to_values, reverse=True)
+    order_exp = [task_to_experiment[t] for t in order_task]
+
+    df_plot = df.copy()
+    df_plot["experiment"] = df_plot["task"].map(task_to_experiment)
+    df_plot = df_plot[df_plot["task"].isin(order_task)].copy()
+    df_plot["experiment"] = pd.Categorical(
+        df_plot["experiment"], categories=order_exp, ordered=True
+    )
+
+    setup_pub_style()
+    sns.set_theme(context="paper", style="darkgrid")
+
+    fig_w = max(14, 14 / 30 * len(order_exp))
+    plt.figure(figsize=(fig_w, 6))
+    ax = sns.boxplot(
+        data=df_plot,
+        x="experiment",
+        y="tsnr_median",
+        order=order_exp,
+        width=0.6,
+        linewidth=1,
+        showfliers=False,
+    )
+
+    annotate_medians_single_boxplot(
+        ax,
+        df_plot,
+        x_col="experiment",
+        y_col="tsnr_median",
+        order=order_exp,
+        fmt="{:.1f}",
+        box_alpha=0.6,
+    )
+
+    ax.set_xlabel("Experiment")
+    ax.set_ylabel("tSNR median")
+    for label in ax.get_xticklabels():
+        label.set_rotation(65)
+        label.set_horizontalalignment("right")
+        label.set_fontweight("bold")
+
+    plt.tight_layout()
+    plt.savefig(fig_path, bbox_inches="tight", pad_inches=0.1, dpi=500)
+    plt.close()
+
+    return fig_path
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description=(
+            "Build a grouped TSV from tsnr_summary.tsv and create a figure showing "
+            "tSNR median distributions per experiment (EXP)."
+        )
+    )
+    parser.add_argument(
+        "tsnr_summary_tsv",
+        help="Path to tsnr_summary.tsv",
+    )
+    args = parser.parse_args()
+
+    out_path = build_grouped_tsnr_summary(args.tsnr_summary_tsv)
+    fig_path = build_tsnr_distribution_figure(args.tsnr_summary_tsv)
+    print(f"[DONE] Wrote grouped TSV to: {out_path}")
+    print(f"[DONE] Wrote figure to: {fig_path}")
+
+
+if __name__ == "__main__":
+    main()

From 66fc9a2481528497b2169f2dc567ae8625768285 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 11 Apr 2026 20:39:40 -0400
Subject: [PATCH 386/401] fix bug

---
 task_dFC/multi_dataset_analysis/tsnr.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/task_dFC/multi_dataset_analysis/tsnr.py b/task_dFC/multi_dataset_analysis/tsnr.py
index 213478d..d61de5a 100644
--- a/task_dFC/multi_dataset_analysis/tsnr.py
+++ b/task_dFC/multi_dataset_analysis/tsnr.py
@@ -8,6 +8,8 @@
 from helper_functions import (
     annotate_medians_single_boxplot,
     build_experiment_display_info,
+    canon_task,
+    get_default_experiment_name_map,
     order_by_median_dict,
     setup_pub_style,
 )
@@ -97,6 +99,18 @@ def build_tsnr_distribution_figure(tsv_path: str) -> Path:
         raise ValueError("No task-wise tSNR values found for plotting.")
 
     tasks_present = sorted(task_to_values.keys())
+    known_tasks = set(get_default_experiment_name_map("real").keys())
+    unknown_tasks = sorted(
+        [task for task in tasks_present if canon_task(task) not in known_tasks]
+    )
+    if unknown_tasks:
+        unknown_str = ", ".join(unknown_tasks)
+        raise ValueError(
+            "Found task(s) not mapped to EXP labels in real-data mapping: "
+            f"{unknown_str}. Remove these tasks from input TSV or add them to "
+            "DEFAULT_EXPERIMENT_NAME_MAP['real'] in helper_functions.py."
+        )
+
     _, task_to_experiment, _, _ = build_experiment_display_info(
         tasks_iterable=tasks_present,
         task_reference_order=tasks_present,

From e808c0190fd4954b2152aadcfd96d08c0b32f2ee Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 11 Apr 2026 21:44:48 -0400
Subject: [PATCH 387/401] update embed visual to include PCA and PLS

---
 .../LE_embedding_visualization.py             | 138 ++++++++++--------
 1 file changed, 75 insertions(+), 63 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py b/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py
index 783c1a6..fd20315 100644
--- a/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py
+++ b/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py
@@ -5,11 +5,13 @@
 import matplotlib as mpl
 import matplotlib.pyplot as plt
 import numpy as np
+from sklearn.decomposition import PCA
 from sklearn.metrics import silhouette_score
 
 from pydfc.ml_utils import (
+    LE_transform,
+    PLSEmbedder,
     dFC_feature_extraction,
-    embed_dFC_features,
     find_available_subjects,
     process_SB_features,
 )
@@ -147,70 +149,80 @@
                             # embed the features
                             # n_components = "auto"
                             n_components = 3
-                            X_embedded, _ = embed_dFC_features(
-                                train_subjects=SUBJECTS,
-                                test_subjects=[],
-                                X_train=X,
-                                X_test=None,
-                                y_train=y,
-                                y_test=None,
-                                subj_label_train=subj_label,
-                                subj_label_test=None,
-                                embedding="PCA",
-                                n_components=n_components,
-                                n_neighbors_LE=125,
-                                LE_embedding_method="embed+procrustes",
-                            )
-                            # X_embedded = TSNE(n_components=n_components, learning_rate='auto', init='random', perplexity=125, metric="correlation").fit_transform(X)
-                            print(silhouette_score(X_embedded, y))
-                            print(X_embedded.shape)
-
-                            # plot
-                            # ---- publication style (light touch) ----
-                            mpl.rcParams.update(
-                                {
-                                    "legend.fontsize": 10,
-                                    "axes.linewidth": 0.9,
-                                    "pdf.fonttype": 42,
-                                    "ps.fonttype": 42,  # keep text as text in PDF/SVG
-                                    "savefig.bbox": "tight",
-                                    "savefig.dpi": 300,
-                                    "figure.dpi": 150,
-                                }
-                            )
-                            fig = plt.figure(figsize=(7, 7))
-                            ax = fig.add_subplot(111, projection="3d")
-
-                            colors = ("#B1B1B1", "#2F5BD3")
-
-                            for label in np.unique(y):
-                                ax.scatter(
-                                    X_embedded[y == label, 0],
-                                    X_embedded[y == label, 1],
-                                    X_embedded[y == label, 2],
-                                    label=["rest", "task"][label],
-                                    s=50,
-                                    c=[colors[label]],
-                                    edgecolors="#202020",
-                                    linewidths=0.25,
-                                    depthshade=False,
+                            for embedding_method in ["PCA", "PLS", "LE"]:
+                                if embedding_method == "PCA":
+                                    X_embedded = PCA(
+                                        n_components=n_components,
+                                        whiten=False,
+                                        svd_solver="full",
+                                        random_state=0,
+                                    ).fit_transform(X)
+                                elif embedding_method == "PLS":
+                                    X_embedded = (
+                                        PLSEmbedder(
+                                            n_components=n_components, scale=False
+                                        )
+                                        .fit(X, y)
+                                        .transform(X)
+                                    )
+                                elif embedding_method == "LE":
+                                    X_embedded = LE_transform(
+                                        X,
+                                        n_components=n_components,
+                                        n_neighbors=125,
+                                        distance_metric="correlation",
+                                    )
+
+                                # X_embedded = TSNE(n_components=n_components, learning_rate='auto', init='random', perplexity=125, metric="correlation").fit_transform(X)
+                                print(silhouette_score(X_embedded, y))
+                                print(X_embedded.shape)
+
+                                # plot
+                                # ---- publication style (light touch) ----
+                                mpl.rcParams.update(
+                                    {
+                                        "legend.fontsize": 10,
+                                        "axes.linewidth": 0.9,
+                                        "pdf.fonttype": 42,
+                                        "ps.fonttype": 42,  # keep text as text in PDF/SVG
+                                        "savefig.bbox": "tight",
+                                        "savefig.dpi": 300,
+                                        "figure.dpi": 150,
+                                    }
+                                )
+                                fig = plt.figure(figsize=(7, 7))
+                                ax = fig.add_subplot(111, projection="3d")
+
+                                colors = ("#B1B1B1", "#2F5BD3")
+
+                                for label in np.unique(y):
+                                    ax.scatter(
+                                        X_embedded[y == label, 0],
+                                        X_embedded[y == label, 1],
+                                        X_embedded[y == label, 2],
+                                        label=["rest", "task"][label],
+                                        s=50,
+                                        c=[colors[label]],
+                                        edgecolors="#202020",
+                                        linewidths=0.25,
+                                        depthshade=False,
+                                    )
+                                plt.legend()
+
+                                # remove tick labels
+                                ax.set_xticklabels([])
+                                ax.set_yticklabels([])
+                                ax.set_zticklabels([])
+
+                                plt.savefig(
+                                    f"{output_root}/{embedding_method}_embed_{task}_{measure_name}.png",
+                                    dpi=fig_dpi,
+                                    bbox_inches=fig_bbox_inches,
+                                    pad_inches=fig_pad,
+                                    format=save_fig_format,
                                 )
-                            plt.legend()
-
-                            # remove tick labels
-                            ax.set_xticklabels([])
-                            ax.set_yticklabels([])
-                            ax.set_zticklabels([])
-
-                            plt.savefig(
-                                f"{output_root}/LE_embed_{task}_{measure_name}.png",
-                                dpi=fig_dpi,
-                                bbox_inches=fig_bbox_inches,
-                                pad_inches=fig_pad,
-                                format=save_fig_format,
-                            )
 
-                            plt.close()
+                                plt.close()
                         except Exception as e:
                             print(
                                 f"Error processing task {task}, dFC_id {dFC_id}, session {session}, run {run}: {e}"

From 3248d7e4a37c874fb32f9007a474ed048ab30622 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 11 Apr 2026 23:31:23 -0400
Subject: [PATCH 388/401] minor

---
 ...LE_embedding_visualization.py => embedding_visualization.py} | 0
 task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh       | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename task_dFC/multi_dataset_analysis/{LE_embedding_visualization.py => embedding_visualization.py} (100%)

diff --git a/task_dFC/multi_dataset_analysis/LE_embedding_visualization.py b/task_dFC/multi_dataset_analysis/embedding_visualization.py
similarity index 100%
rename from task_dFC/multi_dataset_analysis/LE_embedding_visualization.py
rename to task_dFC/multi_dataset_analysis/embedding_visualization.py
diff --git a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
index e4d975e..8c77aef 100644
--- a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
+++ b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
@@ -30,7 +30,7 @@ if [ ! -f "$SCRIPT_PATH" ]; then
 fi
 
 case "$SCRIPT_NAME" in
-  performance_predict.py | performance_factor.py | ml_results.py | dfc_visualization.py | LE_embedding_visualization.py | sample_matrix_visualization.py | task_presence_binarization.py | task_timing_stats.py | cohensd.py)
+  performance_predict.py | performance_factor.py | ml_results.py | dfc_visualization.py | embedding_visualization.py | sample_matrix_visualization.py | task_presence_binarization.py | task_timing_stats.py | cohensd.py)
     python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO" --simul_or_real "$SIMUL_OR_REAL"
     ;;
   *)

From b00d58f34f0a6c4b7680d11ff2dd67fb7bd6d716 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 12 Apr 2026 10:44:54 -0400
Subject: [PATCH 389/401] update sample_matrix_visual to use PLS

---
 .../sample_matrix_visualization.py            | 202 ++++++++++--------
 1 file changed, 109 insertions(+), 93 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index f3e9800..3db6421 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -6,10 +6,12 @@
 import numpy as np
 
 from pydfc.ml_utils import (
+    PLSEmbedder,
     dFC_feature_extraction,
-    embed_dFC_features,
     find_available_subjects,
     process_SB_features,
+    select_num_components_binary_groupcv,
+    subject_center,
 )
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
@@ -22,7 +24,6 @@
 normalize_dFC = False
 FCS_proba_for_SB = True
 train_test_ratio = 0.8
-embedding = "LE"
 
 if use_raw_features:
     raw_or_embedded = "_raw"
@@ -196,36 +197,51 @@
                             X_test_embedded = process_SB_features(
                                 X=X_test, measure_name=measure_name
                             )
-                        # else:
-                        #     # embed dFC features
-                        #     try:
-                        #         X_train_embedded, X_test_embedded = embed_dFC_features(
-                        #             train_subjects=train_subjects,
-                        #             test_subjects=test_subjects,
-                        #             X_train=X_train,
-                        #             X_test=X_test,
-                        #             y_train=y_train,
-                        #             y_test=y_test,
-                        #             subj_label_train=subj_label_train,
-                        #             subj_label_test=subj_label_test,
-                        #             embedding=embedding,
-                        #             n_components="auto",
-                        #             n_neighbors_LE=125,
-                        #             LE_embedding_method="embed+procrustes",
-                        #             measure_is_state_based=measure_is_state_based,
-                        #         )
-                        #         assert (
-                        #             X_train_embedded.shape[0] == y_train.shape[0]
-                        #         ), "Number of samples do not match."
-                        #         assert (
-                        #             X_test_embedded.shape[0] == y_test.shape[0]
-                        #         ), "Number of samples do not match."
-                        #     except Exception as e:
-                        #         print(
-                        #             f"Error in embedding dFC features with {embedding}: {e}"
-                        #         )
-                        #         X_train_embedded = None
-                        #         X_test_embedded = None
+                        # center the data by subject before embedding to remove subject effects
+                        # separately for train and test sets to avoid data leakage
+                        # for both state-based and state-free methods
+                        X_train_embedded = subject_center(
+                            X_train_embedded, subj_label_train, mode="demean"
+                        )
+                        X_test_embedded = subject_center(
+                            X_test_embedded, subj_label_test, mode="demean"
+                        )
+                        if not measure_is_state_based:
+                            # embed dFC features using PLS regression, which is a supervised embedding method that finds the components that best explain the variance in the labels
+                            best_n, _ = select_num_components_binary_groupcv(
+                                X=X_train_embedded,
+                                y=y_train,
+                                groups=subj_label_train,
+                                embedding_method="PLS",
+                                n_list=[
+                                    2,
+                                    3,
+                                    4,
+                                    5,
+                                    10,
+                                    15,
+                                    20,
+                                    25,
+                                    30,
+                                    40,
+                                    50,
+                                ],  # you can adjust this range based on your data
+                                cv=5,  # more stable
+                            )
+                            pls = PLSEmbedder(n_components=best_n, scale=True)
+                            # fit on train set
+                            X_train_embedded = pls.fit_transform(
+                                X_train_embedded, y_train
+                            )
+                            assert (
+                                X_train_embedded.shape[0] == y_train.shape[0]
+                            ), "Number of samples do not match."
+                            # only transform test set
+                            if X_test is not None:
+                                X_test_embedded = pls.transform(X_test_embedded)
+                                assert (
+                                    X_test_embedded.shape[0] == y_test.shape[0]
+                                ), "Number of samples do not match."
 
                         assert (
                             task not in DATA
@@ -233,8 +249,8 @@
                         DATA[task] = {
                             "X_train": X_train,
                             "X_test": X_test,
-                            # "X_train_embedded": X_train_embedded,
-                            # "X_test_embedded": X_test_embedded,
+                            "X_train_embedded": X_train_embedded,
+                            "X_test_embedded": X_test_embedded,
                             "y_train": y_train,
                             "y_test": y_test,
                             "subj_label_train": subj_label_train,
@@ -244,21 +260,21 @@
             # save the data
             # save each task in a separate file and name the file as the task name, measure name, and dataset name
             for task in DATA.keys():
-                # if use_raw_features:
-                #     X_train = DATA[task]["X_train"]
-                #     X_test = DATA[task]["X_test"]
-                # else:
-                #     X_train = DATA[task]["X_train_embedded"]
-                #     X_test = DATA[task]["X_test_embedded"]
-                # y_train = DATA[task]["y_train"]
-                # y_test = DATA[task]["y_test"]
-                # subj_label_train = DATA[task]["subj_label_train"]
-                # subj_label_test = DATA[task]["subj_label_test"]
-                # measure_name = DATA[task]["measure_name"]
-
-                # if X_train is None or X_test is None:
-                #     print(f"Skipping task {task} due to embedding error.")
-                #     continue
+                if use_raw_features:
+                    X_train = DATA[task]["X_train"]
+                    X_test = DATA[task]["X_test"]
+                else:
+                    X_train = DATA[task]["X_train_embedded"]
+                    X_test = DATA[task]["X_test_embedded"]
+                y_train = DATA[task]["y_train"]
+                y_test = DATA[task]["y_test"]
+                subj_label_train = DATA[task]["subj_label_train"]
+                subj_label_test = DATA[task]["subj_label_test"]
+                measure_name = DATA[task]["measure_name"]
+
+                if X_train is None or X_test is None:
+                    print(f"Skipping task {task} due to embedding error.")
+                    continue
 
                 if not os.path.exists(f"{output_root}/processed_data"):
                     os.makedirs(f"{output_root}/processed_data")
@@ -267,47 +283,47 @@
                     DATA[task],
                 )
 
-                # for group, X, y in zip(
-                #     ["train", "test"], [X_train, X_test], [y_train, y_test]
-                # ):
-                #     # if the folder does not exist, create it
-                #     if not os.path.exists(f"{output_root}/{measure_name}"):
-                #         os.makedirs(f"{output_root}/{measure_name}")
-
-                #     # A) Unsorted (your first vis, but rotated so time is horizontal)
-                #     plot_samples_features(
-                #         X,
-                #         y,
-                #         sample_order="original",
-                #         feature_order="original",
-                #         save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_unsorted_{task}_{group}{raw_or_embedded}.png",
-                #         show=False,
-                #     )
-
-                #     # B) Label-sorted (your third vis)
-                #     plot_samples_features(
-                #         X,
-                #         y,
-                #         sample_order="label",
-                #         feature_order="original",
-                #         save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-label_{task}_{group}{raw_or_embedded}.png",
-                #         show=False,
-                #     )
-
-                #     # C) clustering
-                #     plot_samples_features(
-                #         X,
-                #         y,
-                #         sample_order="cluster",
-                #         feature_order="original",
-                #         save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_clustered-samples_{task}_{group}{raw_or_embedded}.png",
-                #         show=False,
-                #     )
-
-                #     save_scalar_colorbar(
-                #         cmap="coolwarm",
-                #         vmin=-1.6,
-                #         vmax=1.6,  # use the same V_RANGE you use in plots
-                #         label="z-scored feature value",
-                #         filename=f"{output_root}/zscore_colorbar.png",
-                #     )
+                for group, X, y in zip(
+                    ["train", "test"], [X_train, X_test], [y_train, y_test]
+                ):
+                    # if the folder does not exist, create it
+                    if not os.path.exists(f"{output_root}/{measure_name}"):
+                        os.makedirs(f"{output_root}/{measure_name}")
+
+                    # A) Unsorted (your first vis, but rotated so time is horizontal)
+                    plot_samples_features(
+                        X,
+                        y,
+                        sample_order="original",
+                        feature_order="original",
+                        save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_unsorted_{task}_{group}{raw_or_embedded}.png",
+                        show=False,
+                    )
+
+                    # B) Label-sorted (your third vis)
+                    plot_samples_features(
+                        X,
+                        y,
+                        sample_order="label",
+                        feature_order="original",
+                        save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_sorted-label_{task}_{group}{raw_or_embedded}.png",
+                        show=False,
+                    )
+
+                    # C) clustering
+                    plot_samples_features(
+                        X,
+                        y,
+                        sample_order="cluster",
+                        feature_order="original",
+                        save_path=f"{output_root}/{measure_name}/feature-sample_{simul_or_real}_clustered-samples_{task}_{group}{raw_or_embedded}.png",
+                        show=False,
+                    )
+
+                    save_scalar_colorbar(
+                        cmap="coolwarm",
+                        vmin=-1.6,
+                        vmax=1.6,  # use the same V_RANGE you use in plots
+                        label="z-scored feature value",
+                        filename=f"{output_root}/zscore_colorbar.png",
+                    )

From f7501f3937047936221beb4654b5309ded58a4c0 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sun, 12 Apr 2026 11:09:33 -0400
Subject: [PATCH 390/401] fix bug: center X_train/X_test directly and set X_*_embedded on every branch

---
 .../sample_matrix_visualization.py            | 24 ++++++++++++-------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 3db6421..0584b53 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -191,25 +191,25 @@
                             raise ValueError(f"Unknown measure name: {measure_name}")
 
                         if measure_is_state_based:
-                            X_train_embedded = process_SB_features(
+                            X_train = process_SB_features(
                                 X=X_train, measure_name=measure_name
                             )
-                            X_test_embedded = process_SB_features(
+                            X_test = process_SB_features(
                                 X=X_test, measure_name=measure_name
                             )
                         # center the data by subject before embedding to remove subject effects
                         # separately for train and test sets to avoid data leakage
                         # for both state-based and state-free methods
-                        X_train_embedded = subject_center(
-                            X_train_embedded, subj_label_train, mode="demean"
+                        X_train_centered = subject_center(
+                            X_train, subj_label_train, mode="demean"
                         )
-                        X_test_embedded = subject_center(
-                            X_test_embedded, subj_label_test, mode="demean"
+                        X_test_centered = subject_center(
+                            X_test, subj_label_test, mode="demean"
                         )
                         if not measure_is_state_based:
                             # embed dFC features using PLS regression, which is a supervised embedding method that finds the components that best explain the variance in the labels
                             best_n, _ = select_num_components_binary_groupcv(
-                                X=X_train_embedded,
+                                X=X_train_centered,
                                 y=y_train,
                                 groups=subj_label_train,
                                 embedding_method="PLS",
@@ -231,17 +231,23 @@
                             pls = PLSEmbedder(n_components=best_n, scale=True)
                             # fit on train set
                             X_train_embedded = pls.fit_transform(
-                                X_train_embedded, y_train
+                                X_train_centered, y_train
                             )
                             assert (
                                 X_train_embedded.shape[0] == y_train.shape[0]
                             ), "Number of samples do not match."
                             # only transform test set
                             if X_test is not None:
-                                X_test_embedded = pls.transform(X_test_embedded)
+                                X_test_embedded = pls.transform(X_test_centered)
                                 assert (
                                     X_test_embedded.shape[0] == y_test.shape[0]
                                 ), "Number of samples do not match."
+                            else:
+                                X_test_embedded = None
+                        else:
+                            # for state-based measures, we skip the embedding step and just use the original features
+                            X_train_embedded = X_train
+                            X_test_embedded = X_test
 
                         assert (
                             task not in DATA

From 9c947b62bfbd23de91e675ef06d79652690ccd50 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 15 Apr 2026 17:14:09 -0400
Subject: [PATCH 391/401] improve performance_factor visualization

---
 .../performance_factor.py                     | 238 +++++++++++++-----
 1 file changed, 178 insertions(+), 60 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index 9d30c21..9f8dfde 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -555,12 +555,17 @@ def build_top_bottom_profile_table(df, quantile=TOP_BOTTOM_QUANTILE):
     return profile_df
 
 
-def plot_top_bottom_profile(profile_df, out_dir, simul_or_real):
+def plot_top_bottom_profile(profile_df, out_dir, simul_or_real, factor_label_map=None):
     valid_df = profile_df.dropna(subset=["cohens_d"]).copy()
     assert (
         not valid_df.empty
     ), "No valid Cohen's d values available for top-vs-bottom profile plot"
 
+    factor_label_map = factor_label_map or {}
+    valid_df["factor_display"] = (
+        valid_df["factor"].map(factor_label_map).fillna(valid_df["factor"].astype(str))
+    )
+
     factor_order = (
         valid_df.groupby("factor", observed=True)["abs_cohens_d"]
         .max()
@@ -568,28 +573,98 @@ def plot_top_bottom_profile(profile_df, out_dir, simul_or_real):
         .index.tolist()
     )
     valid_df = valid_df.sort_values(["factor", "dFC assessment method"])
-    valid_df["factor"] = pd.Categorical(
-        valid_df["factor"], categories=factor_order, ordered=True
+    factor_display_order = [
+        factor_label_map.get(factor, factor) for factor in factor_order
+    ]
+    valid_df["factor_display"] = pd.Categorical(
+        valid_df["factor_display"], categories=factor_display_order, ordered=True
     )
 
-    height = max(6.0, 0.55 * len(factor_order))
-    figure, ax = plt.subplots(figsize=(12.5, height))
-    sns.scatterplot(
+    method_order = sorted(valid_df["dFC assessment method"].astype(str).unique())
+    vivid_palette = sns.color_palette("tab10", n_colors=len(method_order))
+    method_palette = {m: vivid_palette[i] for i, m in enumerate(method_order)}
+
+    # More generous height keeps rows legible when many factors are shown.
+    height = max(7.0, 0.72 * len(factor_order))
+    width = 16.5
+    figure, ax = plt.subplots(figsize=(width, height))
+
+    # Alternating row bands make it easier to track each factor across methods.
+    for idx, factor in enumerate(factor_order):
+        if idx % 2 == 0:
+            ax.axhspan(idx - 0.5, idx + 0.5, color="#F4F6FA", alpha=0.75, zorder=0)
+
+    sns.stripplot(
         data=valid_df,
         x="cohens_d",
-        y="factor",
+        y="factor_display",
         hue="dFC assessment method",
-        s=85,
+        order=factor_display_order,
+        hue_order=method_order,
+        palette=method_palette,
+        dodge=True,
+        jitter=0.08,
+        size=8.2,
+        linewidth=0.85,
+        edgecolor="white",
+        alpha=0.98,
         ax=ax,
     )
 
-    ax.axvline(0.0, color="#333333", linestyle="--", linewidth=1.1)
+    max_abs = float(np.nanmax(np.abs(valid_df["cohens_d"].values)))
+    x_pad = max(0.15, 0.12 * max_abs)
+    x_lim = max_abs + x_pad
+
+    ax.axvline(0.0, color="#1F1F1F", linestyle="--", linewidth=1.5, zorder=3)
+    ax.set_xlim(-x_lim, x_lim)
     ax.set_xlabel("Effect size (Cohen's d): Top 20% vs Bottom 20% within method")
     ax.set_ylabel("Factor")
-    ax.grid(True, axis="x", which="major", linestyle="-", alpha=0.35)
-    ax.legend(title="dFC assessment method", frameon=True)
+
+    ax.tick_params(axis="x", labelsize=12)
+    ax.tick_params(axis="y", labelsize=12)
+
+    # Keep tick labels readable but not too sparse using a "nice number" step.
+    span = 2.0 * x_lim
+    target_ticks = 11  # aim for ~11 major ticks across full span
+    raw_major_step = span / (target_ticks - 1)
+    if raw_major_step <= 0:
+        major_step = 0.5
+    else:
+        exponent = np.floor(np.log10(raw_major_step))
+        base = 10.0**exponent
+        fraction = raw_major_step / base
+        if fraction <= 1.0:
+            nice_fraction = 1.0
+        elif fraction <= 2.0:
+            nice_fraction = 2.0
+        elif fraction <= 2.5:
+            nice_fraction = 2.5
+        elif fraction <= 5.0:
+            nice_fraction = 5.0
+        else:
+            nice_fraction = 10.0
+        major_step = nice_fraction * base
+
+    minor_step = major_step / 2.0
+    ax.xaxis.set_major_locator(MultipleLocator(major_step))
+    ax.xaxis.set_minor_locator(MultipleLocator(minor_step))
+    ax.grid(True, axis="x", which="major", linestyle="-", linewidth=1.0, alpha=0.35)
+    ax.grid(True, axis="x", which="minor", linestyle="--", linewidth=0.8, alpha=0.2)
+
+    legend = ax.legend(
+        title="dFC assessment method",
+        frameon=True,
+        loc="upper left",
+        bbox_to_anchor=(1.01, 1.0),
+        borderaxespad=0,
+    )
+    if legend is not None:
+        legend.get_title().set_fontsize(12)
+        for txt in legend.get_texts():
+            txt.set_fontsize(11)
+
     sns.despine(ax=ax, top=True, right=True)
-    figure.tight_layout()
+    figure.tight_layout(rect=[0.18, 0, 0.83, 1])
 
     fig_path = f"{out_dir}/performance_top_bottom_profile_{simul_or_real}.png"
     savefig_pub(fig_path)
@@ -655,14 +730,16 @@ def build_rdoc_performance_group_table(df, simul_or_real):
     return summary_long, count_table, proportion_table
 
 
-def plot_rdoc_performance_group_stacked_bar(proportion_table, out_dir, simul_or_real):
-    width = max(9.0, 1.35 * len(proportion_table.index))
-    figure, ax = plt.subplots(figsize=(width, 6.6))
+def plot_rdoc_performance_group_stacked_bar(
+    proportion_table, out_dir, simul_or_real, x_label="RDoC domain"
+):
+    width = max(10.0, 1.6 * len(proportion_table.index))
+    figure, ax = plt.subplots(figsize=(width, 7.2))
 
     palette = {
-        "Low": "#C44E52",
-        "Medium": "#DDCF84",
-        "High": "#4C9F70",
+        "Low": "#D1495B",
+        "Medium": "#F4D35E",
+        "High": "#2A9D8F",
     }
     proportion_pct = proportion_table.mul(100.0)
     bottom = np.zeros(len(proportion_pct.index))
@@ -676,19 +753,33 @@ def plot_rdoc_performance_group_stacked_bar(proportion_table, out_dir, simul_or_
             label=label,
             color=palette[label],
             edgecolor="white",
-            linewidth=0.8,
+            linewidth=1.0,
         )
         bottom += values
 
-    ax.set_xlabel("RDoC domain")
-    ax.set_ylabel("Samples (%)")
+    for label in ax.get_xticklabels():
+        label.set_rotation(25)
+        label.set_horizontalalignment("right")
+        label.set_fontsize(12)
+        label.set_fontweight("bold")
+
+    ax.set_xlabel(x_label, fontweight="bold")
+    ax.set_ylabel("Samples (%)", fontweight="bold")
     ax.set_ylim(0, 100)
-    ax.yaxis.set_major_locator(MultipleLocator(10))
-    ax.yaxis.set_minor_locator(MultipleLocator(5))
-    ax.grid(True, axis="y", which="major", linestyle="-", alpha=0.35)
-    ax.grid(True, axis="y", which="minor", linestyle="--", alpha=0.18)
-    plt.setp(ax.get_xticklabels(), rotation=25, ha="right")
-    ax.legend(title="Performance group", frameon=True)
+    ax.yaxis.set_major_locator(MultipleLocator(20))
+    ax.yaxis.set_minor_locator(MultipleLocator(10))
+    ax.grid(True, axis="y", which="major", linestyle="-", alpha=0.34)
+    ax.grid(True, axis="y", which="minor", linestyle="--", alpha=0.16)
+    ax.tick_params(axis="y", labelsize=12)
+    for label in ax.get_yticklabels():
+        label.set_fontweight("bold")
+    legend = ax.legend(
+        title="Performance group", frameon=True, fontsize=11, title_fontsize=12
+    )
+    if legend is not None:
+        legend.get_title().set_fontweight("bold")
+        for txt in legend.get_texts():
+            txt.set_fontweight("bold")
     sns.despine(ax=ax, top=True, right=True)
     figure.tight_layout()
 
@@ -698,29 +789,32 @@ def plot_rdoc_performance_group_stacked_bar(proportion_table, out_dir, simul_or_
     return fig_path
 
 
-def plot_rdoc_performance_group_heatmap(proportion_table, out_dir, simul_or_real):
+def plot_rdoc_performance_group_heatmap(
+    proportion_table, out_dir, simul_or_real, x_label="Performance group"
+):
     annot_table = proportion_table.mul(100.0).applymap(lambda value: f"{value:.1f}%")
 
-    figure, ax = plt.subplots(figsize=(7.4, max(4.8, 0.7 * len(proportion_table.index))))
+    figure, ax = plt.subplots(figsize=(8.6, max(5.2, 0.82 * len(proportion_table.index))))
     heatmap = sns.heatmap(
         proportion_table.loc[:, PERFORMANCE_GROUP_LABELS],
-        cmap="YlGnBu",
+        cmap="crest",
         vmin=0.0,
         vmax=1.0,
         annot=annot_table.loc[:, PERFORMANCE_GROUP_LABELS],
         fmt="",
-        linewidths=0.7,
+        linewidths=0.9,
         linecolor="white",
-        cbar_kws={"shrink": 0.8, "pad": 0.02},
+        cbar_kws={"shrink": 0.84, "pad": 0.03},
         ax=ax,
     )
     colorbar = heatmap.collections[0].colorbar
-    colorbar.set_label("Proportion", fontweight="bold")
+    colorbar.set_label("Proportion", fontweight="bold", fontsize=12)
 
-    ax.set_xlabel("Performance group")
-    ax.set_ylabel("RDoC domain")
-    plt.setp(ax.get_xticklabels(), rotation=0)
-    plt.setp(ax.get_yticklabels(), rotation=0)
+    ax.set_xlabel(x_label, fontweight="bold")
+    ax.set_ylabel("RDoC domain", fontweight="bold")
+    plt.setp(ax.get_xticklabels(), rotation=0, fontsize=12, fontweight="bold")
+    plt.setp(ax.get_yticklabels(), rotation=0, fontsize=12, fontweight="bold")
+    ax.set_title("RDoC composition by performance group", pad=12, fontweight="bold")
     figure.tight_layout()
 
     fig_path = f"{out_dir}/performance_group_by_rdoc_heatmap_{simul_or_real}.png"
@@ -729,21 +823,26 @@ def plot_rdoc_performance_group_heatmap(proportion_table, out_dir, simul_or_real
     return fig_path
 
 
-def plot_rdoc_overall_distribution(df, out_dir, simul_or_real):
+def plot_rdoc_overall_distribution(df, out_dir, simul_or_real, x_label="RDoC domain"):
     rdoc_order = _get_present_rdoc_order(df, simul_or_real)
     assert rdoc_order, "No RDoC values found for plotting"
 
-    width = max(10, 1.3 * len(rdoc_order))
-    height = 6.5
+    width = max(12.0, 1.55 * len(rdoc_order))
+    height = 7.0
     figure, ax = plt.subplots(figsize=(width, height))
 
+    palette = sns.color_palette("Spectral", n_colors=len(rdoc_order))
+    palette_map = {rdoc: palette[i] for i, rdoc in enumerate(rdoc_order)}
+
     sns.boxplot(
         data=df,
         x="RDoC",
         y="classification_balanced_accuracy",
         order=rdoc_order,
         showfliers=False,
-        width=0.55,
+        width=0.58,
+        palette=palette_map,
+        linewidth=1.2,
         ax=ax,
     )
     sns.stripplot(
@@ -751,21 +850,26 @@ def plot_rdoc_overall_distribution(df, out_dir, simul_or_real):
         x="RDoC",
         y="classification_balanced_accuracy",
         order=rdoc_order,
-        color="#303030",
-        alpha=0.55,
-        size=3,
-        jitter=0.22,
+        palette=palette_map,
+        alpha=0.45,
+        size=3.1,
+        jitter=0.2,
         ax=ax,
     )
 
-    ax.set_xlabel("RDoC domain")
-    ax.set_ylabel("Balanced accuracy")
+    ax.set_xlabel(x_label, fontweight="bold")
+    ax.set_ylabel("Balanced accuracy", fontweight="bold")
     ax.set_ylim(0.45, 1.02)
-    plt.setp(ax.get_xticklabels(), rotation=25, ha="right")
+    plt.setp(
+        ax.get_xticklabels(), rotation=25, ha="right", fontsize=12, fontweight="bold"
+    )
     ax.yaxis.set_major_locator(MultipleLocator(0.05))
     ax.yaxis.set_minor_locator(MultipleLocator(0.025))
-    ax.grid(True, axis="y", which="major", linestyle="-", alpha=0.35)
-    ax.grid(True, axis="y", which="minor", linestyle="--", alpha=0.2)
+    ax.tick_params(axis="y", labelsize=12)
+    for label in ax.get_yticklabels():
+        label.set_fontweight("bold")
+    ax.grid(True, axis="y", which="major", linestyle="-", alpha=0.34)
+    ax.grid(True, axis="y", which="minor", linestyle="--", alpha=0.18)
     sns.despine(ax=ax, top=True, right=True)
     figure.tight_layout()
 
@@ -775,7 +879,7 @@ def plot_rdoc_overall_distribution(df, out_dir, simul_or_real):
     return fig_path
 
 
-def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real):
+def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real, x_label="RDoC domain"):
     rdoc_order = _get_present_rdoc_order(df, simul_or_real)
     assert rdoc_order, "No RDoC values found for plotting"
 
@@ -789,13 +893,13 @@ def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real):
     n_methods = df["dFC assessment method"].nunique()
     # Generous per-domain width so boxes never feel cramped
     n_domains = len(rdoc_order)
-    # Each domain gets ~2.8 in; minimum figure width 18 in
-    axes_width = max(18.0, 2.8 * n_domains)
-    # Reserve ~3.5 in on the right for the legend column
-    legend_width = 3.5
+    # Each domain gets ~3.1 in; minimum figure width 20 in
+    axes_width = max(20.0, 3.1 * n_domains)
+    # Reserve more room for the legend column
+    legend_width = 4.2
     total_width = axes_width + legend_width
-    # Height: 8 in gives comfortable y-axis room; scale slightly with methods
-    height = max(8.0, 0.35 * n_methods + 6.5)
+    # Height: keep panels open and readable
+    height = max(8.5, 0.42 * n_methods + 6.8)
 
     fig_paths = []
     for _, combo in combo_df.iterrows():
@@ -810,6 +914,10 @@ def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real):
 
         figure, ax = plt.subplots(figsize=(total_width, height))
 
+        palette = sns.color_palette("tab10", n_colors=n_methods)
+        hue_order = sorted(sub_df["dFC assessment method"].dropna().astype(str).unique())
+        method_palette = {method: palette[i] for i, method in enumerate(hue_order)}
+
         sns.boxplot(
             data=sub_df,
             x="RDoC",
@@ -818,13 +926,15 @@ def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real):
             order=rdoc_order,
             showfliers=False,
             width=0.72,
-            linewidth=1.4,
+            linewidth=1.35,
+            palette=method_palette,
+            hue_order=hue_order,
             ax=ax,
         )
 
         ax.set_ylim(0.45, 1.02)
-        ax.set_xlabel("RDoC domain", labelpad=12, fontsize=14)
-        ax.set_ylabel("Balanced accuracy", labelpad=12, fontsize=14)
+        ax.set_xlabel(x_label, labelpad=12, fontsize=14, fontweight="bold")
+        ax.set_ylabel("Balanced accuracy", labelpad=12, fontsize=14, fontweight="bold")
         ax.set_title(
             f"{classifier}  |  {embedding}",
             fontweight="bold",
@@ -840,6 +950,9 @@ def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real):
             label.set_rotation(30)
             label.set_horizontalalignment("right")
             label.set_fontsize(13)
+            label.set_fontweight("bold")
+        for label in ax.get_yticklabels():
+            label.set_fontweight("bold")
 
         handles, labels = ax.get_legend_handles_labels()
         if handles:
@@ -854,6 +967,11 @@ def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real):
                 loc="center left",
                 bbox_to_anchor=(axes_width / total_width + 0.01, 0.5),
             )
+            if figure.legends:
+                for legend in figure.legends:
+                    legend.get_title().set_fontweight("bold")
+                    for txt in legend.get_texts():
+                        txt.set_fontweight("bold")
 
         sns.despine(ax=ax, top=True, right=True)
         # Leave right margin for the figure-level legend

From 25161ad0fc0d89f4db7ea1ae88b9c1140f38030d Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 15 Apr 2026 17:30:14 -0400
Subject: [PATCH 392/401] minor fix: add default factor labels and domain axis label helper

---
 .../performance_factor.py                     | 41 ++++++++++++++++---
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index 9f8dfde..4d3e18f 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -57,6 +57,23 @@
 TOP_BOTTOM_QUANTILE = 0.2
 PERFORMANCE_GROUP_LABELS = ["Low", "Medium", "High"]
 
+DEFAULT_FACTOR_LABEL_MAP = {
+    "task_ratio_avg": "average task ratio",
+    "task_durations_iqr": "task duration IQR",
+    "task_durations_median": "task duration median",
+    "rest_durations_iqr": "rest duration IQR",
+    "rest_durations_median": "rest duration median",
+    "OI_median": "median OI",
+    "CohensD_mean": "mean Cohen's d",
+    "CohensD_max": "max Cohen's d",
+    "transition_freq_avg": "average transition frequency",
+    "median_tsnr_avg_over_subjects": "median tSNR averaged over subjects",
+}
+
+
+def get_domain_axis_label(simul_or_real):
+    return "RDoC domain" if simul_or_real == "real" else "Simulation design category"
+
 
 def parse_args():
     helptext = """
@@ -561,7 +578,7 @@ def plot_top_bottom_profile(profile_df, out_dir, simul_or_real, factor_label_map
         not valid_df.empty
     ), "No valid Cohen's d values available for top-vs-bottom profile plot"
 
-    factor_label_map = factor_label_map or {}
+    factor_label_map = factor_label_map or DEFAULT_FACTOR_LABEL_MAP
     valid_df["factor_display"] = (
         valid_df["factor"].map(factor_label_map).fillna(valid_df["factor"].astype(str))
     )
@@ -1033,14 +1050,25 @@ def main():
     )
     profile_df.to_csv(profile_csv_path, index=False)
     profile_fig_path = plot_top_bottom_profile(
-        profile_df, paths["out_dir"], args.simul_or_real
+        profile_df,
+        paths["out_dir"],
+        args.simul_or_real,
+        factor_label_map=DEFAULT_FACTOR_LABEL_MAP,
     )
 
+    domain_x_label = get_domain_axis_label(args.simul_or_real)
+
     rdoc_overall_path = plot_rdoc_overall_distribution(
-        df, paths["out_dir"], args.simul_or_real
+        df,
+        paths["out_dir"],
+        args.simul_or_real,
+        x_label=domain_x_label,
     )
     rdoc_faceted_paths = plot_rdoc_faceted_distribution(
-        df, paths["out_dir"], args.simul_or_real
+        df,
+        paths["out_dir"],
+        args.simul_or_real,
+        x_label=domain_x_label,
     )
     rdoc_group_long_df, rdoc_group_count_table, rdoc_group_prop_table = (
         build_rdoc_performance_group_table(df, args.simul_or_real)
@@ -1050,7 +1078,10 @@ def main():
     )
     rdoc_group_long_df.to_csv(rdoc_group_csv_path, index=False)
     rdoc_group_bar_path = plot_rdoc_performance_group_stacked_bar(
-        rdoc_group_prop_table, paths["out_dir"], args.simul_or_real
+        rdoc_group_prop_table,
+        paths["out_dir"],
+        args.simul_or_real,
+        x_label=domain_x_label,
     )
     rdoc_group_heatmap_path = plot_rdoc_performance_group_heatmap(
         rdoc_group_prop_table, paths["out_dir"], args.simul_or_real

From 2ee236fb9887742e97f8c9211d731e9ed4c6a235 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Wed, 15 Apr 2026 18:35:37 -0400
Subject: [PATCH 393/401] show sample counts on group plots; compact faceted legend

---
 .../performance_factor.py                     | 72 +++++++++++++++----
 1 file changed, 60 insertions(+), 12 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/performance_factor.py b/task_dFC/multi_dataset_analysis/performance_factor.py
index 4d3e18f..6d66d19 100644
--- a/task_dFC/multi_dataset_analysis/performance_factor.py
+++ b/task_dFC/multi_dataset_analysis/performance_factor.py
@@ -748,7 +748,11 @@ def build_rdoc_performance_group_table(df, simul_or_real):
 
 
 def plot_rdoc_performance_group_stacked_bar(
-    proportion_table, out_dir, simul_or_real, x_label="RDoC domain"
+    proportion_table,
+    out_dir,
+    simul_or_real,
+    x_label="RDoC domain",
+    count_table=None,
 ):
     width = max(10.0, 1.6 * len(proportion_table.index))
     figure, ax = plt.subplots(figsize=(width, 7.2))
@@ -763,6 +767,9 @@ def plot_rdoc_performance_group_stacked_bar(
 
     for label in PERFORMANCE_GROUP_LABELS:
         values = proportion_pct[label].to_numpy()
+        counts = None
+        if count_table is not None and label in count_table.columns:
+            counts = count_table[label].to_numpy()
         ax.bar(
             proportion_pct.index,
             values,
@@ -772,6 +779,27 @@ def plot_rdoc_performance_group_stacked_bar(
             edgecolor="white",
             linewidth=1.0,
         )
+
+        # Annotate each stacked segment with sample count.
+        if counts is not None:
+            for i, (val, cnt) in enumerate(zip(values, counts)):
+                if cnt <= 0 or val <= 0:
+                    continue
+                y = bottom[i] + 0.5 * val
+                # Skip tiny slivers to avoid clutter.
+                if val < 5.0:
+                    continue
+                ax.text(
+                    i,
+                    y,
+                    f"n={int(cnt)}",
+                    ha="center",
+                    va="center",
+                    fontsize=9,
+                    fontweight="bold",
+                    color="#1F1F1F",
+                )
+
         bottom += values
 
     for label in ax.get_xticklabels():
@@ -807,9 +835,25 @@ def plot_rdoc_performance_group_stacked_bar(
 
 
 def plot_rdoc_performance_group_heatmap(
-    proportion_table, out_dir, simul_or_real, x_label="Performance group"
+    proportion_table,
+    out_dir,
+    simul_or_real,
+    x_label="Performance group",
+    count_table=None,
 ):
-    annot_table = proportion_table.mul(100.0).applymap(lambda value: f"{value:.1f}%")
+    if count_table is not None:
+        count_view = count_table.loc[
+            proportion_table.index, PERFORMANCE_GROUP_LABELS
+        ].astype(int)
+        annot_table = proportion_table.loc[:, PERFORMANCE_GROUP_LABELS].mul(100.0)
+        annot_table = annot_table.apply(
+            lambda col: [
+                f"{pct:.1f}%\n(n={cnt})"
+                for pct, cnt in zip(col.values, count_view[col.name].values)
+            ]
+        )
+    else:
+        annot_table = proportion_table.mul(100.0).applymap(lambda value: f"{value:.1f}%")
 
     figure, ax = plt.subplots(figsize=(8.6, max(5.2, 0.82 * len(proportion_table.index))))
     heatmap = sns.heatmap(
@@ -910,10 +954,10 @@ def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real, x_label="RDoC dom
     n_methods = df["dFC assessment method"].nunique()
     # Generous per-domain width so boxes never feel cramped
     n_domains = len(rdoc_order)
-    # Each domain gets ~3.1 in; minimum figure width 20 in
-    axes_width = max(20.0, 3.1 * n_domains)
-    # Reserve more room for the legend column
-    legend_width = 4.2
+    # Each domain gets ~2.8 in; keep figure compact for manuscript layouts.
+    axes_width = max(17.0, 2.8 * n_domains)
+    # Small right margin only; legend now sits at the top-right of the full figure.
+    legend_width = 1.6
     total_width = axes_width + legend_width
     # Height: keep panels open and readable
     height = max(8.5, 0.42 * n_methods + 6.8)
@@ -981,8 +1025,8 @@ def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real, x_label="RDoC dom
                 title_fontsize=12,
                 fontsize=11,
                 frameon=True,
-                loc="center left",
-                bbox_to_anchor=(axes_width / total_width + 0.01, 0.5),
+                loc="upper right",
+                bbox_to_anchor=(0.995, 0.995),
             )
             if figure.legends:
                 for legend in figure.legends:
@@ -991,8 +1035,8 @@ def plot_rdoc_faceted_distribution(df, out_dir, simul_or_real, x_label="RDoC dom
                         txt.set_fontweight("bold")
 
         sns.despine(ax=ax, top=True, right=True)
-        # Leave right margin for the figure-level legend
-        figure.tight_layout(rect=[0, 0, axes_width / total_width, 1])
+        # Leave a slim top/right margin for the figure-level legend.
+        figure.tight_layout(rect=[0, 0, 0.94, 0.96])
 
         classifier_key = str(classifier).replace(" ", "_").replace("/", "-")
         embedding_key = str(embedding).replace(" ", "_").replace("/", "-")
@@ -1082,9 +1126,13 @@ def main():
         paths["out_dir"],
         args.simul_or_real,
         x_label=domain_x_label,
+        count_table=rdoc_group_count_table,
     )
     rdoc_group_heatmap_path = plot_rdoc_performance_group_heatmap(
-        rdoc_group_prop_table, paths["out_dir"], args.simul_or_real
+        rdoc_group_prop_table,
+        paths["out_dir"],
+        args.simul_or_real,
+        count_table=rdoc_group_count_table,
     )
 
     print(f"Saved dataframe with shape: {df.shape}")

From 29de31c30fbc568fa2417a993cb492bc7fca0ec7 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Fri, 24 Apr 2026 13:37:42 -0400
Subject: [PATCH 394/401] change session handling in load_TS

---
 .flake8                         |   1 +
 .pre-commit-config.yaml         |   2 +-
 pydfc/data_loader.py            |  76 ++++++++--------
 task_dFC/FCS_estimate.py        | 149 --------------------------------
 task_dFC/dFC_assessment.py      | 108 -----------------------
 task_dFC/nifti_to_roi_signal.py | 135 -----------------------------
 task_dFC/validation.py          |  58 -------------
 7 files changed, 43 insertions(+), 486 deletions(-)
 delete mode 100644 task_dFC/FCS_estimate.py
 delete mode 100644 task_dFC/dFC_assessment.py
 delete mode 100644 task_dFC/nifti_to_roi_signal.py
 delete mode 100644 task_dFC/validation.py

diff --git a/.flake8 b/.flake8
index 7f73516..b57c737 100644
--- a/.flake8
+++ b/.flake8
@@ -25,6 +25,7 @@ ignore =
     E731,
     E713,
     E714,
+    E722,
     E741,
     F403,
     F405,
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 70511d2..7a5618b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -56,5 +56,5 @@ repos:
     rev: 7.0.0
     hooks:
     -   id: flake8
-        args: [--config, .flake8, --verbose, pydfc, HCP_resting_state_analysis, task_dFC]
+        args: [--config, .flake8, --verbose, pydfc, HCP_resting_state_analysis]
         additional_dependencies: [flake8-use-fstring]
diff --git a/pydfc/data_loader.py b/pydfc/data_loader.py
index 59307ac..66ec2e8 100644
--- a/pydfc/data_loader.py
+++ b/pydfc/data_loader.py
@@ -527,25 +527,18 @@ def multi_nifti2timeseries(
 def load_TS(
     data_root,
     file_name,
-    SESSIONs,
     subj_id2load=None,
     task=None,
+    session=None,
     run=None,
 ):
     """
     load a TIME_SERIES object from a .npy file
-    if SESSIONs is a list, it will load all the sessions,
-        if it is a string, it will load that session
     if subj_id2load is None, it will load all the subjects
     file_name: name of the file to load
-        format example: {subj_id}_{task}_{run}_time-series.npy
+        format example: {subj_id}_{session}_{task}_{run}_time-series.npy
         (keep the {} for the variables)
     """
-    # check if SESSIONs is a list or a string
-    flag = False
-    if type(SESSIONs) is str:
-        SESSIONs = [SESSIONs]
-        flag = True
 
     if subj_id2load is None:
         SUBJECTS = find_subj_list(data_root)
@@ -553,37 +546,50 @@ def load_TS(
         assert "sub-" in subj_id2load, "subj_id2load must start with 'sub-'"
         SUBJECTS = [subj_id2load]
 
-    TS = {}
-    for session in SESSIONs:
-        TS[session] = None
-        for subj in SUBJECTS:
-            subj_fldr = subj
-            # make the file_name
-            TS_file = deepcopy(file_name)
-            if "{subj_id}" in file_name:
-                TS_file = TS_file.replace("{subj_id}", subj)
-            if "{task}" in file_name:
-                assert task is not None, "task must be provided"
-                TS_file = TS_file.replace("{task}", task)
-            if "{run}" in file_name:
-                assert run is not None, "run must be provided"
-                TS_file = TS_file.replace("{run}", run)
-
-            try:
+    TS = None
+    for subj in SUBJECTS:
+        subj_fldr = subj
+        # make the file_name
+        TS_file = deepcopy(file_name)
+        if "{subj_id}" in file_name:
+            TS_file = TS_file.replace("{subj_id}", subj)
+        if "{task}" in file_name:
+            assert task is not None, "task must be provided"
+            TS_file = TS_file.replace("{task}", task)
+        if "{session}" in file_name:
+            assert session is not None, "session must be provided"
+            TS_file = TS_file.replace("{session}", session)
+        if "{run}" in file_name:
+            assert run is not None, "run must be provided"
+            TS_file = TS_file.replace("{run}", run)
+
+        try:
+            if session is None:
                 time_series = np.load(
                     f"{data_root}/{subj_fldr}/{TS_file}", allow_pickle="True"
                 ).item()
-            except FileNotFoundError:
-                print(f"File {TS_file} not found for {subj}")
-                continue
-
-            if TS[session] is None:
-                TS[session] = time_series
             else:
-                TS[session].concat_ts(time_series)
+                time_series = np.load(
+                    f"{data_root}/{subj_fldr}/{session}/{TS_file}",
+                    allow_pickle="True",
+                ).item()
+        except FileNotFoundError:
+            print(f"File {TS_file} not found for {subj}")
+            continue
+
+        if TS is None:
+            TS = time_series
+        else:
+            try:
+                TS.concat_ts(time_series)
+            except AssertionError as e:
+                # print the error message
+                print(f"Error in concatenating time series for {subj}: {e}")
+                # raise error with a message and stop the program
+                raise Exception(
+                    f"Fs of subj {subj} TS is {time_series.Fs} while the group Fs is {TS.Fs}"
+                )
 
-    if flag:
-        return TS[SESSIONs[0]]
     return TS
 
 
diff --git a/task_dFC/FCS_estimate.py b/task_dFC/FCS_estimate.py
deleted file mode 100644
index de4d738..0000000
--- a/task_dFC/FCS_estimate.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import os
-import time
-import warnings
-
-import numpy as np
-
-from pydfc import MultiAnalysis, data_loader
-
-warnings.simplefilter("ignore")
-
-os.environ["MKL_NUM_THREADS"] = "16"
-os.environ["NUMEXPR_NUM_THREADS"] = "16"
-os.environ["OMP_NUM_THREADS"] = "16"
-
-################################# Parameters #################################
-# data paths
-# main_root = '../../DATA/ds002785/' # for local
-main_root = "../../../DATA/task-based/openneuro/ds002785"  # for server
-roi_root = f"{main_root}/derivatives/ROI_timeseries"
-output_root = f"{main_root}/derivatives/fitted_MEASURES"
-
-# for consistency we use 0 for resting state
-TASKS = [
-    "task-restingstate",
-    "task-anticipation",
-    "task-emomatching",
-    "task-faces",
-    "task-gstroop",
-    "task-workingmemory",
-]
-
-job_id = int(os.getenv("SGE_TASK_ID"))
-TASK_id = job_id - 1  # SGE_TASK_ID starts from 1 not 0
-if TASK_id >= len(TASKS):
-    print("TASK_id out of TASKS")
-    exit()
-task = TASKS[TASK_id]
-
-###### MEASUREMENT PARAMETERS ######
-
-# W is in sec
-
-params_methods = {
-    # Sliding Parameters
-    "W": 44,
-    "n_overlap": 1.0,
-    "sw_method": "pear_corr",
-    "tapered_window": True,
-    # TIME_FREQ
-    "TF_method": "WTC",
-    # CLUSTERING AND DHMM
-    "clstr_base_measure": "SlidingWindow",
-    # HMM
-    "hmm_iter": 20,
-    "dhmm_obs_state_ratio": 16 / 24,
-    # State Parameters
-    "n_states": 12,
-    "n_subj_clstrs": 20,
-    # Parallelization Parameters
-    "n_jobs": 2,
-    "verbose": 0,
-    "backend": "loky",
-    # SESSION
-    "session": task,
-    # Hyper Parameters
-    "normalization": True,
-    "num_subj": None,  # None or 216?
-    "num_time_point": None,  # None or set?
-}
-
-###### HYPER PARAMETERS ALTERNATIVE ######
-
-MEASURES_name_lst = [
-    "SlidingWindow",
-    "Time-Freq",
-    "CAP",
-    "ContinuousHMM",
-    "Windowless",
-    "Clustering",
-    "DiscreteHMM",
-]
-
-alter_hparams = {
-    # 'session': ['Rest1_RL', 'Rest2_LR', 'Rest2_RL'],
-    # 'n_overlap': [0, 0.25, 0.75, 1],
-    # 'n_states': [6, 16],
-    # # 'normalization': [],
-    # 'num_subj': [50, 100, 200],
-    # 'num_select_nodes': [30, 50, 333],
-    # 'num_time_point': [800, 1000],
-    # 'Fs_ratio': [0.50, 0.75, 1.5],
-    # 'noise_ratio': [1.00, 2.00, 3.00],
-    # 'num_realization': []
-}
-
-###### MultiAnalysis PARAMETERS ######
-
-params_multi_analysis = {
-    # Parallelization Parameters
-    "n_jobs": None,
-    "verbose": 0,
-    "backend": "loky",
-}
-
-################################# LOAD DATA #################################
-
-BOLD = data_loader.load_TS(
-    data_root=roi_root, file_name="time_series.npy", SESSIONs=task, subj_id2load=None
-)
-
-################################# Visualize BOLD #################################
-
-# for session in BOLD:
-#     BOLD.visualize(start_time=0, end_time=2000, nodes_lst=list(range(10)),
-#         save_image=False, output_root=None)
-
-################################ Measures of dFC #################################
-
-MA = MultiAnalysis(
-    analysis_name=f"task-based-dFC-ds002785-{task}", **params_multi_analysis
-)
-
-MEASURES_lst = MA.measures_initializer(MEASURES_name_lst, params_methods, alter_hparams)
-
-tic = time.time()
-print("Measurement Started ...")
-
-################################# estimate FCS #################################
-
-for MEASURE_id, measure in enumerate(MEASURES_lst):
-
-    print("MEASURE: " + measure.measure_name)
-    print("FCS estimation started...")
-
-    if measure.is_state_based:
-        measure.estimate_FCS(time_series=BOLD)
-
-    # dFC_analyzer.estimate_group_FCS(time_series_dict=BOLD)
-    print("FCS estimation done.")
-
-    # Save
-    if not os.path.exists(f"{output_root}/{task}"):
-        os.makedirs(f"{output_root}/{task}")
-    np.save(f"{output_root}/{task}/MEASURE_{str(MEASURE_id)}.npy", measure)
-
-print(f"Measurement required {time.time() - tic:0.3f} seconds.")
-np.save(f"{output_root}/{task}/multi_analysis.npy", MA)
-
-#################################################################################
diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py
deleted file mode 100644
index a381f95..0000000
--- a/task_dFC/dFC_assessment.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import os
-import time
-import warnings
-
-import numpy as np
-
-from pydfc import MultiAnalysis, data_loader
-
-warnings.simplefilter("ignore")
-
-os.environ["MKL_NUM_THREADS"] = "16"
-os.environ["NUMEXPR_NUM_THREADS"] = "16"
-os.environ["OMP_NUM_THREADS"] = "16"
-
-################################# Parameters #################################
-
-# Data parameters
-# main_root = '../../DATA/ds002785/' # for local
-main_root = "../../../DATA/task-based/openneuro/ds002785/"  # for server
-
-# subjects used for dFC assessment do not need to be the same as those used for FCS_estimate
-# you can set the new roi root and data load parameters here:
-roi_root = f"{main_root}/derivatives/ROI_timeseries"
-fitted_measures_root = f"{main_root}/derivatives/fitted_MEASURES"
-output_root = f"{main_root}/derivatives/dFC_assessed"
-
-# for consistency we use 0 for resting state
-TASKS = [
-    "task-restingstate",
-    "task-anticipation",
-    "task-emomatching",
-    "task-faces",
-    "task-gstroop",
-    "task-workingmemory",
-]
-
-# find all subjects across all tasks
-SUBJECTS = data_loader.find_subj_list(data_root=roi_root, sessions=TASKS)
-
-# job_id selects the subject
-job_id = int(os.getenv("SGE_TASK_ID"))
-if job_id > len(SUBJECTS):
-    print("job_id > len(SUBJECTS)")
-    exit()
-subj_id = SUBJECTS[job_id - 1]  # SGE_TASK_ID starts from 1 not 0
-
-for task in TASKS:
-
-    MA = np.load(
-        f"{fitted_measures_root}/{task}/multi_analysis.npy", allow_pickle="TRUE"
-    ).item()
-
-    # check if the subject has this task
-    SUBJECTS_with_this_task = data_loader.find_subj_list(
-        data_root=roi_root, sessions=[task]
-    )
-    if not subj_id in SUBJECTS_with_this_task:
-        print(f"subject {subj_id} not in the list of subjects with task {task}")
-        continue
-
-    ################################# LOAD FIT MEASURES #################################
-
-    ALL_RECORDS = os.listdir(f"{fitted_measures_root}/{task}/")
-    ALL_RECORDS = [i for i in ALL_RECORDS if "MEASURE" in i]
-    ALL_RECORDS.sort()
-    MEASURES_fit_lst = list()
-    for s in ALL_RECORDS:
-        fit_measure = np.load(
-            f"{fitted_measures_root}/{task}/{s}", allow_pickle="TRUE"
-        ).item()
-        MEASURES_fit_lst.append(fit_measure)
-    MA.set_MEASURES_fit_lst(MEASURES_fit_lst)
-    print("fitted MEASURES loaded ...")
-
-    ################################# LOAD DATA #################################
-
-    print(
-        f"subject-level dFC assessment CODE started running ... for task {task} of subject {subj_id} ..."
-    )
-
-    BOLD = data_loader.load_TS(
-        data_root=roi_root,
-        file_name="time_series.npy",
-        SESSIONs=[task],
-        subj_id2load=subj_id,
-    )
-
-    ################################# dFC ASSESSMENT #################################
-
-    tic = time.time()
-    print("Measurement Started ...")
-
-    print("dFC estimation started...")
-    dFC_dict = MA.subj_lvl_dFC_assess(time_series_dict=BOLD)
-    print("dFC estimation done.")
-
-    print(f"Measurement required {time.time() - tic:0.3f} seconds.")
-
-    ################################# SAVE DATA #################################
-
-    folder = f"{output_root}/{task}/{subj_id}"
-    if not os.path.exists(folder):
-        os.makedirs(folder)
-
-    for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]):
-        np.save(f"{folder}/dFC_{str(dFC_id)}.npy", dFC)
-
-#######################################################################################
diff --git a/task_dFC/nifti_to_roi_signal.py b/task_dFC/nifti_to_roi_signal.py
deleted file mode 100644
index 1e52cb8..0000000
--- a/task_dFC/nifti_to_roi_signal.py
+++ /dev/null
@@ -1,135 +0,0 @@
-import json
-import os
-import warnings
-
-import numpy as np
-
-from pydfc import data_loader, task_utils
-
-warnings.simplefilter("ignore")
-
-################################# Parameters #################################
-# data paths
-# main_root = '../../DATA/ds002785' # for local
-main_root = "../../../DATA/task-based/openneuro/ds002785"  # for server
-fmriprep_root = f"{main_root}/derivatives/fmriprep"
-output_root = f"{main_root}/derivatives/ROI_timeseries"
-
-bold_suffix = "_space-MNI152NLin2009cAsym_desc-preproc_bold.nii.gz"
-
-# for consistency we use 0 for resting state
-TASKS = [
-    "task-restingstate",
-    "task-anticipation",
-    "task-emomatching",
-    "task-faces",
-    "task-gstroop",
-    "task-workingmemory",
-]
-
-# find all subjects
-ALL_SUBJs = os.listdir(fmriprep_root)
-ALL_SUBJs = [i for i in ALL_SUBJs if ("sub-" in i) and (not ".html" in i)]
-ALL_SUBJs.sort()
-
-# pick the subject
-job_id = int(os.getenv("SGE_TASK_ID"))
-subj = ALL_SUBJs[job_id - 1]  # SGE_TASK_ID starts from 1 not 0
-
-print(
-    f"subject-level ROI signal extraction CODE started running ... for subject: {subj} ..."
-)
-################################# FIND THE FUNC FILE #################################
-for task in TASKS:
-    # find the func file for this subject and task
-    ALL_TASK_FILES = os.listdir(f"{fmriprep_root}/{subj}/func/")
-    ALL_TASK_FILES = [
-        i for i in ALL_TASK_FILES if (bold_suffix in i) and (task in i)
-    ]  # only keep the denoised files? or use the original files?
-    # print(ALL_TASK_FILES)
-    if not len(ALL_TASK_FILES) == 1:
-        # if the func file is not found, exclude the subject
-        print("Func file not found for " + subj + " " + task)
-        continue
-    fmriprep_file = f"{fmriprep_root}/{subj}/func/{ALL_TASK_FILES[0]}"
-    info_file = (
-        f"{main_root}/{subj}/func/{ALL_TASK_FILES[0].replace(bold_suffix, '_bold.json')}"
-    )
-
-    ################################# LOAD JSON INFO #########################
-    # Opening JSON file as a dictionary
-    f = open(info_file)
-    acquisition_data = json.load(f)
-    f.close()
-    TR_mri = acquisition_data["RepetitionTime"]
-    ################################# EXTRACT TIME SERIES #########################
-    # extract ROI signals and convert to TIME_SERIES object
-    time_series = data_loader.nifti2timeseries(
-        nifti_file=fmriprep_file,
-        n_rois=100,
-        Fs=1 / TR_mri,
-        subj_id=subj,
-        confound_strategy="no_motion",
-        standardize="zscore",
-        TS_name="BOLD",
-        session=task,
-    )
-    num_time_mri = time_series.n_time
-    ################################# EXTRACT TASK LABELS #########################
-    oversampling = 50  # more samples per TR than the func data to have a better event_labels time resolution
-    if task == "task-restingstate":
-        events = []
-        event_types = ["rest"]
-        event_labels = np.zeros((int(num_time_mri * oversampling), 1))
-        task_labels = np.zeros((int(num_time_mri * oversampling), 1))
-        Fs_task = float(1 / TR_mri) * oversampling
-    else:
-        task_events_root = f"{main_root}/{subj}/func/"
-        ALL_EVENTS_FILES = os.listdir(task_events_root)
-        ALL_EVENTS_FILES = [
-            i
-            for i in ALL_EVENTS_FILES
-            if (subj in i) and (task in i) and ("events.tsv" in i)
-        ]
-        if not len(ALL_EVENTS_FILES) == 1:
-            # if the events file is not found, exclude the subject
-            print("Events file not found for " + subj + " " + task)
-            continue
-        # load the tsv events file
-        events_file = task_events_root + ALL_EVENTS_FILES[0]
-        events = np.genfromtxt(events_file, delimiter="\t", dtype=str)
-        # get the task labels
-        event_types = ["rest"] + list(np.unique(events[1:, 2]))
-        event_labels, Fs_task = task_utils.events_time_to_labels(
-            events=events,
-            TR_mri=TR_mri,
-            num_time_mri=num_time_mri,
-            event_types=event_types,
-            oversampling=oversampling,
-            return_0_1=False,
-        )
-        # fill task labels with 0 (rest) and k (task's index)
-        task_labels = np.multiply(event_labels != 0, TASKS.index(task))
-    ################################# SAVE #################################
-    # save the ROI time series and task data
-    task_data = {
-        "task": task,
-        "task_labels": task_labels,
-        "task_types": TASKS,
-        "event_labels": event_labels,
-        "event_types": event_types,
-        "events": events,
-        "Fs_task": Fs_task,
-        "TR_mri": TR_mri,
-        "num_time_mri": num_time_mri,
-    }
-    subj_folder = f"{subj}_{task}"
-    if not os.path.exists(f"{output_root}/{subj_folder}/"):
-        os.makedirs(f"{output_root}/{subj_folder}/")
-    np.save(f"{output_root}/{subj_folder}/time_series.npy", time_series)
-    np.save(f"{output_root}/{subj_folder}/task_data.npy", task_data)
-
-print(
-    f"subject-level ROI signal extraction CODE finished running ... for subject: {subj} ..."
-)
-####################################################################
diff --git a/task_dFC/validation.py b/task_dFC/validation.py
deleted file mode 100644
index 05fcb24..0000000
--- a/task_dFC/validation.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import os
-import time
-import warnings
-
-import numpy as np
-
-from pydfc import MultiAnalysis, data_loader
-
-warnings.simplefilter("ignore")
-
-os.environ["MKL_NUM_THREADS"] = "16"
-os.environ["NUMEXPR_NUM_THREADS"] = "16"
-os.environ["OMP_NUM_THREADS"] = "16"
-
-################################# Parameters #################################
-
-# Data parameters
-# main_root = '../../DATA/ds002785/' # for local
-main_root = "../../../DATA/task-based/openneuro/ds002785/"  # for server
-dFC_assessed_root = main_root + "dFC_assessed/"
-output_root = main_root + "validation_results/"
-
-################################# LOAD FIT MEASURES #################################
-
-SUBJECTS = data_loader.find_subj_list(
-    data_root=roi_root, sessions=params_data_load["SESSIONs"]
-)
-
-ALL_RECORDS = os.listdir(dFC_assessed_root)
-ALL_RECORDS = [i for i in ALL_RECORDS if "dFC" in i]
-ALL_RECORDS.sort()
-dFC_lst = list()
-for s in ALL_RECORDS:
-    dFC = np.load(dFC_assessed_root + s, allow_pickle="TRUE").item()
-    dFC_lst.append(dFC)
-print("dFCs loaded ...")
-
-################################# SIMILARITY MEASUREMENT #################################
-
-# similarity_assessment = SIMILARITY_ASSESSMENT(dFCM_lst=dFCM_dict['dFCM_lst'])
-
-# tic = time.time()
-# print('Measurement Started ...')
-
-# print("Similarity measurement started...")
-# SUBJ_output = similarity_assessment.run(FILTERS=dFC_analyzer.hyper_param_info, downsampling_method='default')
-# print("Similarity measurement done.")
-
-# print('Measurement required %0.3f seconds.' % (time.time() - tic, ))
-
-# # Save
-# folder = output_root+'similarity_measured'
-# if not os.path.exists(folder):
-#     os.makedirs(folder)
-
-# np.save(folder+'/SUBJ_'+str(subj_id)+'_output.npy', SUBJ_output)
-
-#######################################################################################

From 73cae4ccb6d4938b9c2424272bdcacb75cdb336e Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 25 Apr 2026 19:28:56 -0400
Subject: [PATCH 395/401] float32 for saving dFC in dFC_assessment.py

---
 task_dFC/dFC_assessment.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/task_dFC/dFC_assessment.py b/task_dFC/dFC_assessment.py
index 2be912a..1269e37 100644
--- a/task_dFC/dFC_assessment.py
+++ b/task_dFC/dFC_assessment.py
@@ -136,6 +136,12 @@ def run_dFC_assess(
         os.makedirs(folder)
 
     for dFC_id, dFC in enumerate(dFC_dict["dFC_lst"]):
+
+        # Optional: cast each dFC to float32 to save space
+        dFC.FCSs_ = {
+            key: value.astype(np.float32, copy=False) for key, value in dFC.FCSs_.items()
+        }
+
         np.save(f"{folder}dFC_{file_suffix}_{dFC_id}.npy", dFC)
 
 

From 6f721a8290e715cf657a44d3c859e90373c4d886 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Sat, 25 Apr 2026 20:58:36 -0400
Subject: [PATCH 396/401] improve measure handling in ml_utils

---
 pydfc/ml_utils.py                             | 61 ++++++++++---------
 .../embedding_visualization.py                | 43 ++++++-------
 .../sample_matrix_visualization.py            | 16 +----
 3 files changed, 56 insertions(+), 64 deletions(-)

diff --git a/pydfc/ml_utils.py b/pydfc/ml_utils.py
index 7052efa..f183ae8 100644
--- a/pydfc/ml_utils.py
+++ b/pydfc/ml_utils.py
@@ -339,6 +339,7 @@ def dFC_feature_extraction(
     If False, use dFC_vecs (dFC matrix as features).
     """
     dFC_measure_name = None
+    measure_is_state_based = None
     X_train = None
     y_train = None
     subj_label_train = list()
@@ -377,12 +378,17 @@ def dFC_feature_extraction(
             y_train = np.concatenate((y_train, y_subj), axis=0)
 
         dFC_measure_name_new = dFC.measure.measure_name
+        measure_is_state_based_new = dFC.measure.is_state_based
         if dFC_measure_name is None:
             dFC_measure_name = dFC_measure_name_new
+            measure_is_state_based = measure_is_state_based_new
         else:
             assert (
                 dFC_measure_name == dFC_measure_name_new
             ), "dFC measure is not consistent."
+            assert (
+                measure_is_state_based == measure_is_state_based_new
+            ), "dFC measure is not consistent."
 
     X_test = None
     y_test = None
@@ -421,12 +427,17 @@ def dFC_feature_extraction(
             y_test = np.concatenate((y_test, y_subj), axis=0)
 
         dFC_measure_name_new = dFC.measure.measure_name
+        measure_is_state_based_new = dFC.measure.is_state_based
         if dFC_measure_name is None:
             dFC_measure_name = dFC_measure_name_new
+            measure_is_state_based = measure_is_state_based_new
         else:
             assert (
                 dFC_measure_name == dFC_measure_name_new
             ), "dFC measure is not consistent."
+            assert (
+                measure_is_state_based == measure_is_state_based_new
+            ), "dFC measure is not consistent."
 
     # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
     subj_label_train = np.array(subj_label_train)
@@ -440,6 +451,7 @@ def dFC_feature_extraction(
         subj_label_train,
         subj_label_test,
         dFC_measure_name,
+        measure_is_state_based,
     )
 
 
@@ -1974,35 +1986,28 @@ def task_presence_classification(
         f"Number of train subjects: {len(train_subjects)} and test subjects: {len(test_subjects)}"
     )
 
-    X_train, X_test, y_train, y_test, subj_label_train, subj_label_test, measure_name = (
-        dFC_feature_extraction(
-            task=task,
-            train_subjects=train_subjects,
-            test_subjects=test_subjects,
-            dFC_id=dFC_id,
-            roi_root=roi_root,
-            dFC_root=dFC_root,
-            run=run,
-            session=session,
-            dynamic_pred=dynamic_pred,
-            normalize_dFC=normalize_dFC,
-            FCS_proba_for_SB=True,  # for state-based dFC features, we use FCS_proba
-        )
+    (
+        X_train,
+        X_test,
+        y_train,
+        y_test,
+        subj_label_train,
+        subj_label_test,
+        measure_name,
+        measure_is_state_based,
+    ) = dFC_feature_extraction(
+        task=task,
+        train_subjects=train_subjects,
+        test_subjects=test_subjects,
+        dFC_id=dFC_id,
+        roi_root=roi_root,
+        dFC_root=dFC_root,
+        run=run,
+        session=session,
+        dynamic_pred=dynamic_pred,
+        normalize_dFC=normalize_dFC,
+        FCS_proba_for_SB=True,  # for state-based dFC features, we use FCS_proba
     )
-    measure_is_state_based = None
-    if measure_name in ["SlidingWindow", "Time-Freq"]:
-        measure_is_state_based = False
-    elif measure_name in [
-        "CAP",
-        "Clustering",
-        "ContinuousHMM",
-        "DiscreteHMM",
-        "Windowless",
-    ]:
-        measure_is_state_based = True
-    else:
-        # raise error
-        raise ValueError(f"Unknown measure name: {measure_name}")
 
     if measure_is_state_based:
         X_train = process_SB_features(X=X_train, measure_name=measure_name)
diff --git a/task_dFC/multi_dataset_analysis/embedding_visualization.py b/task_dFC/multi_dataset_analysis/embedding_visualization.py
index fd20315..e3975d6 100644
--- a/task_dFC/multi_dataset_analysis/embedding_visualization.py
+++ b/task_dFC/multi_dataset_analysis/embedding_visualization.py
@@ -109,20 +109,27 @@
                             SUBJECTS = SUBJECTS[0:1]
                             print(f"Number of subjects: {len(SUBJECTS)}")
 
-                            X, _, y, _, subj_label, _, measure_name = (
-                                dFC_feature_extraction(
-                                    task=task,
-                                    train_subjects=SUBJECTS,
-                                    test_subjects=[],
-                                    dFC_id=dFC_id,
-                                    roi_root=roi_root,
-                                    dFC_root=dFC_root,
-                                    run=run,
-                                    session=session,
-                                    dynamic_pred="no",
-                                    normalize_dFC=normalize_dFC,
-                                    FCS_proba_for_SB=True,
-                                )
+                            (
+                                X,
+                                _,
+                                y,
+                                _,
+                                subj_label,
+                                _,
+                                measure_name,
+                                measure_is_state_based,
+                            ) = dFC_feature_extraction(
+                                task=task,
+                                train_subjects=SUBJECTS,
+                                test_subjects=[],
+                                dFC_id=dFC_id,
+                                roi_root=roi_root,
+                                dFC_root=dFC_root,
+                                run=run,
+                                session=session,
+                                dynamic_pred="no",
+                                normalize_dFC=normalize_dFC,
+                                FCS_proba_for_SB=True,
                             )
 
                             assert (
@@ -132,13 +139,7 @@
                                 X.shape[0] == subj_label.shape[0]
                             ), "Number of samples do not match."
 
-                            if measure_name in [
-                                "CAP",
-                                "Clustering",
-                                "ContinuousHMM",
-                                "DiscreteHMM",
-                                "Windowless",
-                            ]:
+                            if measure_is_state_based:
                                 X = process_SB_features(X=X, measure_name=measure_name)
 
                             print(f"Task: {task}")
diff --git a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
index 0584b53..7f3bfcd 100644
--- a/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
+++ b/task_dFC/multi_dataset_analysis/sample_matrix_visualization.py
@@ -155,6 +155,7 @@
                             subj_label_train,
                             subj_label_test,
                             measure_name,
+                            measure_is_state_based,
                         ) = dFC_feature_extraction(
                             task=task,
                             train_subjects=train_subjects,
@@ -175,21 +176,6 @@
                             )
                             continue
 
-                        measure_is_state_based = None
-                        if measure_name in ["SlidingWindow", "Time-Freq"]:
-                            measure_is_state_based = False
-                        elif measure_name in [
-                            "CAP",
-                            "Clustering",
-                            "ContinuousHMM",
-                            "DiscreteHMM",
-                            "Windowless",
-                        ]:
-                            measure_is_state_based = True
-                        else:
-                            # raise error
-                            raise ValueError(f"Unknown measure name: {measure_name}")
-
                         if measure_is_state_based:
                             X_train = process_SB_features(
                                 X=X_train, measure_name=measure_name

From f8e5b042e427a873e5f1f575d7530a7bf595c440 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Mon, 27 Apr 2026 16:22:22 -0400
Subject: [PATCH 397/401] minor change in dfc_visualization

Co-authored-by: Copilot <copilot@github.com>
---
 .../dfc_visualization.py                      | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/task_dFC/multi_dataset_analysis/dfc_visualization.py b/task_dFC/multi_dataset_analysis/dfc_visualization.py
index 52199d8..c028069 100644
--- a/task_dFC/multi_dataset_analysis/dfc_visualization.py
+++ b/task_dFC/multi_dataset_analysis/dfc_visualization.py
@@ -1,6 +1,7 @@
 import argparse
 import json
 import os
+import re
 import sys
 
 from pydfc.dfc_utils import TR_intersection, rank_norm
@@ -14,6 +15,20 @@
 
 normalize_dFC = True
 
+
+def discover_available_dfc_ids(dfc_root):
+    """Return the sorted dFC IDs found anywhere under ``dfc_root``."""
+    dfc_ids = set()
+    for root, _, files in os.walk(dfc_root):
+        for file_name in files:
+            if not file_name.endswith(".npy"):
+                continue
+            match = re.search(r"_(\d+)\.npy$", file_name)
+            if match:
+                dfc_ids.add(int(match.group(1)))
+    return sorted(dfc_ids)
+
+
 #######################################################################################
 
 if __name__ == "__main__":
@@ -87,7 +102,12 @@
             RUNS = {task: [None] for task in TASKS}
 
         DATA = {}
-        for dFC_id in range(0, 7):
+        dFC_ids = discover_available_dfc_ids(dFC_root)
+        if len(dFC_ids) == 0:
+            print(f"No dFC files found under {dFC_root}; skipping dataset {dataset}.")
+            continue
+
+        for dFC_id in dFC_ids:
             for session in SESSIONS[:1]:  # Only process the first session
                 for task_id, task in enumerate(TASKS):
                     for run in RUNS[task][:1]:  # Only process the first run

From ef0223eb1caacbd200371d069b6cbb28e264db15 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Tue, 9 Jun 2026 23:34:07 -0400
Subject: [PATCH 398/401] improve cohensd supp figure

---
 task_dFC/multi_dataset_analysis/cohensd.py | 118 ++++++++++++++-------
 1 file changed, 80 insertions(+), 38 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/cohensd.py b/task_dFC/multi_dataset_analysis/cohensd.py
index f3801e2..155ae77 100644
--- a/task_dFC/multi_dataset_analysis/cohensd.py
+++ b/task_dFC/multi_dataset_analysis/cohensd.py
@@ -21,6 +21,80 @@
 
 #######################################################################################
 
+
+def plot_cohensd_per_experiment(
+    df,
+    experiment_order,
+    save_path,
+    y_col="abs_d",
+    y_label="|Cohen's d|",
+):
+    """
+    Boxplot + individual points of |Cohen's d| per experiment.
+
+    Boxplot drawn first (solid, visible fill). Points drawn on top in a single
+    contrasting color with transparency so the box statistics remain readable.
+    Simulated data uses symlog y-scale to handle extreme outliers.
+    """
+    fig_width = max(10, 0.7 * len(experiment_order))
+    fig, ax = plt.subplots(figsize=(fig_width, 7))
+
+    # 1. Boxplot first — solid, clearly visible
+    sns.boxplot(
+        data=df,
+        x="experiment",
+        y=y_col,
+        order=experiment_order,
+        showfliers=False,
+        width=0.55,
+        linewidth=2.0,
+        color="#2057B6",
+        ax=ax,
+    )
+    # Bring fill opacity to 0.45 — visible but not opaque
+    for patch in ax.patches:
+        r, g, b, _ = patch.get_facecolor()
+        patch.set_facecolor((r, g, b, 0.45))
+    # Make the median line stand out
+    for line in ax.lines:
+        if line.get_linestyle() == "-":
+            line.set_linewidth(2.5)
+
+    # 2. Points on top — single contrasting color, semi-transparent
+    sns.stripplot(
+        data=df,
+        x="experiment",
+        y=y_col,
+        order=experiment_order,
+        color="#C0392B",
+        dodge=False,
+        jitter=0.18,
+        size=6,
+        alpha=0.85,
+        linewidth=2.0,
+        edgecolor="#7B241C",
+        ax=ax,
+    )
+
+    if ax.legend_:
+        ax.legend_.remove()
+
+    ax.set_ylim(bottom=0)
+
+    ax.set_xlabel("Experiment", fontsize=13, fontweight="bold")
+    ax.set_ylabel(y_label, fontsize=13, fontweight="bold")
+    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", fontsize=11)
+    plt.setp(ax.get_yticklabels(), fontsize=11)
+    sns.despine(ax=ax)
+    plt.tight_layout()
+
+    os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
+    plt.savefig(save_path, dpi=150, bbox_inches="tight", pad_inches=0.2, format="png")
+    plt.close()
+
+
+#######################################################################################
+
 if __name__ == "__main__":
     # argparse
     HELPTEXT = """
@@ -346,46 +420,14 @@
     fig_width = max(14, 0.6 * len(task_order))
 
     # -------- Figure 1: Boxplot of |Cohen's d| per task with individual samples --------
-    plt.figure(figsize=(fig_width, 7))
-
-    # Boxplot (hide outliers to avoid double-plotting with the samples)
-    ax = sns.boxplot(
-        data=DF,
-        x="experiment",
-        y="abs_d",
-        order=experiment_order,
-        showfliers=False,
-        width=0.6,
+    plot_cohensd_per_experiment(
+        df=DF,
+        experiment_order=experiment_order,
+        save_path=f"{output_root}/CohensD_abs_boxplot_with_samples_per_task.png",
+        y_col="abs_d",
+        y_label="|Cohen's d|",
     )
 
-    # Overlay individual samples (one point per ROI sample)
-    sns.stripplot(
-        data=DF,
-        x="experiment",
-        y="abs_d",
-        order=experiment_order,
-        dodge=False,
-        jitter=0.25,
-        size=2,
-        alpha=0.45,
-        ax=ax,
-    )
-
-    ax.set_xlabel("Experiment")
-    ax.set_ylabel("|Cohen's d|")
-    ax.set_ylim(bottom=0)
-    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
-    plt.tight_layout()
-
-    plt.savefig(
-        f"{output_root}/CohensD_abs_boxplot_with_samples_per_task.png",
-        dpi=150,
-        bbox_inches="tight",
-        pad_inches=0.2,
-        format="png",
-    )
-    plt.close()
-
     # -------- Figure 2: Max |Cohen's d| across ROIs per task --------
     plt.figure(figsize=(fig_width, 6))
 

From af302d6eb856656f7b5a9f50a054e8748b6fb128 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 11 Jun 2026 07:56:00 -0400
Subject: [PATCH 399/401] improve cohensd boxplot

---
 task_dFC/multi_dataset_analysis/cohensd.py | 96 +++++++++++++---------
 1 file changed, 57 insertions(+), 39 deletions(-)

diff --git a/task_dFC/multi_dataset_analysis/cohensd.py b/task_dFC/multi_dataset_analysis/cohensd.py
index 155ae77..3b892d1 100644
--- a/task_dFC/multi_dataset_analysis/cohensd.py
+++ b/task_dFC/multi_dataset_analysis/cohensd.py
@@ -8,6 +8,7 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
+from matplotlib.colors import to_rgba
 from nilearn import datasets, plotting
 
 from pydfc import data_loader
@@ -22,6 +23,15 @@
 #######################################################################################
 
 
+_BOX_COLOR = "#4472C4"
+_POINT_COLOR = "#C0392B"
+_POINT_EDGE_COLOR = "#7B241C"
+_BOX_OFFSET = -0.17  # box center relative to x-tick
+_STRIP_OFFSET = 0.17  # point cloud center relative to x-tick
+_BOX_WIDTH = 0.28
+_STRIP_JITTER = 0.09
+
+
 def plot_cohensd_per_experiment(
     df,
     experiment_order,
@@ -30,54 +40,62 @@ def plot_cohensd_per_experiment(
     y_label="|Cohen's d|",
 ):
     """
-    Boxplot + individual points of |Cohen's d| per experiment.
+    Boxplot (left of tick) + individual points (right of tick) per experiment.
 
-    Boxplot drawn first (solid, visible fill). Points drawn on top in a single
-    contrasting color with transparency so the box statistics remain readable.
+    Boxes and points are spatially separated so neither buries the other.
-    Simulated data uses symlog y-scale to handle extreme outliers.
+    The y-axis is linear with its lower limit fixed at 0 (no symlog scaling).
     """
     fig_width = max(10, 0.7 * len(experiment_order))
     fig, ax = plt.subplots(figsize=(fig_width, 7))
 
-    # 1. Boxplot first — solid, clearly visible
-    sns.boxplot(
-        data=df,
-        x="experiment",
-        y=y_col,
-        order=experiment_order,
+    n = len(experiment_order)
+    positions = np.arange(n)
+    exp_to_idx = {exp: i for i, exp in enumerate(experiment_order)}
+
+    # --- Boxplot left of center ---
+    box_data = [
+        df[df["experiment"] == exp][y_col].dropna().values for exp in experiment_order
+    ]
+    bp = ax.boxplot(
+        box_data,
+        positions=positions + _BOX_OFFSET,
+        widths=_BOX_WIDTH,
         showfliers=False,
-        width=0.55,
-        linewidth=2.0,
-        color="#2057B6",
-        ax=ax,
-    )
-    # Bring fill opacity to 0.45 — visible but not opaque
-    for patch in ax.patches:
-        r, g, b, _ = patch.get_facecolor()
-        patch.set_facecolor((r, g, b, 0.45))
-    # Make the median line stand out
-    for line in ax.lines:
-        if line.get_linestyle() == "-":
-            line.set_linewidth(2.5)
-
-    # 2. Points on top — single contrasting color, semi-transparent
-    sns.stripplot(
-        data=df,
-        x="experiment",
-        y=y_col,
-        order=experiment_order,
-        color="#C0392B",
-        dodge=False,
-        jitter=0.18,
-        size=6,
-        alpha=0.85,
-        linewidth=2.0,
-        edgecolor="#7B241C",
-        ax=ax,
+        patch_artist=True,
+        medianprops=dict(color="#1A1A1A", linewidth=2.5),
+        boxprops=dict(linewidth=1.8),
+        whiskerprops=dict(linewidth=1.6),
+        capprops=dict(linewidth=1.6),
     )
+    for patch in bp["boxes"]:
+        patch.set_facecolor(to_rgba(_BOX_COLOR, 0.5))
+        patch.set_edgecolor(_BOX_COLOR)
+    for line in bp["whiskers"] + bp["caps"]:
+        line.set_color(_BOX_COLOR)
+
+    # --- Strip points right of center ---
+    rng = np.random.default_rng(42)
+    for exp in experiment_order:
+        vals = df[df["experiment"] == exp][y_col].dropna().values
+        if len(vals) == 0:
+            continue
+        x_jit = (exp_to_idx[exp] + _STRIP_OFFSET) + rng.uniform(
+            -_STRIP_JITTER, _STRIP_JITTER, len(vals)
+        )
+        ax.scatter(
+            x_jit,
+            vals,
+            color=_POINT_COLOR,
+            alpha=0.55,
+            s=30,
+            linewidths=0.5,
+            edgecolors=_POINT_EDGE_COLOR,
+            zorder=3,
+        )
 
-    if ax.legend_:
-        ax.legend_.remove()
+    ax.set_xticks(positions)
+    ax.set_xticklabels(experiment_order)
+    ax.set_xlim(-0.6, n - 0.4)
 
     ax.set_ylim(bottom=0)
 

From f4858268f378cec5283b1acfbad8ca6a3351414c Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 11 Jun 2026 14:52:02 -0400
Subject: [PATCH 400/401] convert run scripts to templates for publication

---
 simul_dFC/README.rst                          |  27 +++
 simul_dFC/run_scripts_sge/run_simulator.sh    |  40 +++-
 simul_dFC/run_scripts_slurm/run_simulator.sh  |  13 +-
 .../tasks_info_ds003465.json                  |   8 +-
 task_dFC/README.rst                           | 121 +++++++---
 task_dFC/run_scripts_sge/dataset_info.json    |  14 +-
 task_dFC/run_scripts_sge/global_configs.json  | 208 ++++++++++++++----
 task_dFC/run_scripts_sge/methods_config.json  |  13 +-
 .../run_scripts_sge/multi_dataset_info.json   |  38 ++++
 task_dFC/run_scripts_sge/run_FCS.sh           |  31 ++-
 task_dFC/run_scripts_sge/run_ML.sh            |  21 +-
 .../run_across_dataset_analysis.sh            |  47 ++++
 task_dFC/run_scripts_sge/run_dFC.sh           |  24 +-
 task_dFC/run_scripts_sge/run_fmriprep.sh      |  33 +--
 task_dFC/run_scripts_sge/run_nifti_to_roi.sh  |  38 +++-
 task_dFC/run_scripts_sge/run_report.sh        |  21 +-
 task_dFC/run_scripts_slurm/dataset_info.json  |   6 +-
 task_dFC/run_scripts_slurm/global_config.json |   6 +-
 .../run_scripts_slurm/multi_dataset_info.json |   6 +-
 task_dFC/run_scripts_slurm/run_FCS.sh         |  11 +-
 task_dFC/run_scripts_slurm/run_ML.sh          |  11 +-
 .../run_across_dataset_analysis.sh            |  14 +-
 task_dFC/run_scripts_slurm/run_dFC.sh         |  11 +-
 task_dFC/run_scripts_slurm/run_fmriprep.sh    |   6 +-
 .../run_scripts_slurm/run_nifti_to_roi.sh     |   9 +-
 task_dFC/run_scripts_slurm/run_report.sh      |  11 +-
 26 files changed, 599 insertions(+), 189 deletions(-)
 create mode 100644 simul_dFC/README.rst
 create mode 100644 task_dFC/run_scripts_sge/multi_dataset_info.json
 create mode 100644 task_dFC/run_scripts_sge/run_across_dataset_analysis.sh

diff --git a/simul_dFC/README.rst b/simul_dFC/README.rst
new file mode 100644
index 0000000..2c54299
--- /dev/null
+++ b/simul_dFC/README.rst
@@ -0,0 +1,27 @@
+============================================
+PydFC: simul_dFC Module Documentation
+============================================
+
+The ``simul_dFC`` module generates **synthetic task-based fMRI data** for benchmarking dFC methods under controlled conditions.
+
+It uses `The Virtual Brain (TVB) <https://www.thevirtualbrain.org>`_ simulator to produce BOLD signals driven by a known task design, allowing ground-truth evaluation of dFC methods.
+
+Two task paradigms are supported:
+
+*   **Real task-derived** (``tasks_info_ds003465.json``) — task timing extracted from an OpenNeuro dataset (ds003465) to drive the simulation.
+*   **Synthetic pulse-train** (``tasks_info_pulseTrain.json``) — parametric block designs with configurable onset, duration, and frequency.
+
+Running
+-------
+
+Set ``VENV_PATH`` and ``PYDFC_CODE_DIR`` in the cluster configuration block of the job script, replace the scheduler placeholder (``YOUR_ACCOUNT`` for SLURM, ``YOUR_QUEUE`` for SGE), then submit::
+
+    # SLURM
+    sbatch --array=1-N run_scripts_slurm/run_simulator.sh
+
+    # SGE
+    qsub -t 1-N run_scripts_sge/run_simulator.sh
+
+The script reads ``subj_list.txt`` (one subject ID per line), ``dataset_info.json``, and ``tasks_info.json`` through relative paths (``./...``), so all three files must be present in the job's working directory.
+
+Simulated outputs are consumed directly by the ``task_dFC`` pipeline starting at ``dFC_assessment.py``.
diff --git a/simul_dFC/run_scripts_sge/run_simulator.sh b/simul_dFC/run_scripts_sge/run_simulator.sh
index 6176236..f25fa25 100644
--- a/simul_dFC/run_scripts_sge/run_simulator.sh
+++ b/simul_dFC/run_scripts_sge/run_simulator.sh
@@ -1,19 +1,39 @@
-#!/bin/sh
+#!/bin/bash
 #
+# Run in the submission directory so the relative paths below resolve.
 #$ -cwd
-#$ -j y
+#$ -N simul_dfc_job
 #$ -o logs/simul_out.txt
 #$ -e logs/simul_err.txt
-#$ -q origami.q
-#$ -l h_vmem=8G
-#$ -t 1:200
+#$ -l h_rt=24:00:00
+#$ -l h_vmem=8g
+#$ -t 1-200
+#$ -q YOUR_QUEUE
 
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# For conda environments, replace VENV_PATH above with:
+#   CONDA_SH="/path/to/conda/etc/profile.d/conda.sh"
+#   CONDA_ENV="pydfc"
+# -----------------------------------------------------------
+
+SUBJECT_LIST="./subj_list.txt"
 DATASET_INFO="./dataset_info.json"
+TASKS_INFO="./tasks_info.json"
+
+SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST`
+echo "Subject ID: $SUBJECT_ID"
 
-source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
-conda activate pydfc
+# Activate virtual environment
+source "$VENV_PATH"
+# For conda: source "$CONDA_SH" && conda activate "$CONDA_ENV"
 
-python "/data/origami/dFC/CODEs/pydfc/dFC/simul_dFC/task_data_simulator.py" \
---dataset_info $DATASET_INFO
+# Run Python script
+python "$PYDFC_CODE_DIR/simul_dFC/task_data_simulator.py" \
+--dataset_info $DATASET_INFO \
+--tasks_info $TASKS_INFO \
+--participant_id $SUBJECT_ID
 
-conda deactivate
+# Deactivate environment
+deactivate
diff --git a/simul_dFC/run_scripts_slurm/run_simulator.sh b/simul_dFC/run_scripts_slurm/run_simulator.sh
index ccb4065..9dd5c5a 100644
--- a/simul_dFC/run_scripts_slurm/run_simulator.sh
+++ b/simul_dFC/run_scripts_slurm/run_simulator.sh
@@ -3,7 +3,7 @@
 #SBATCH --job-name=simul_dfc_job   # Optional: Name of your job
 #SBATCH --output=logs/simul_out.txt  # Standard output log
 #SBATCH --error=logs/simul_err.txt   # Standard error log
-#SBATCH --account=def-jbpoline           # Account
+#SBATCH --account=YOUR_ACCOUNT           # Account
 #SBATCH --time=24:00:00                # Walltime for each task (24 hours)
 #SBATCH --mem=8G                     # Memory request per node
 #SBATCH --array=1-200                # Task array specification
@@ -15,11 +15,16 @@ TASKS_INFO="./tasks_info.json"
 SUBJECT_ID=`sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST`
 echo "Subject ID: $SUBJECT_ID"
 
-# Activate  virtual environment
-source "/home/mt00/venvs/pydfc/bin/activate"
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# -----------------------------------------------------------
+
+# Activate virtual environment
+source "$VENV_PATH"
 
 # Run Python script
-python "/home/mt00/pydfc/dFC/simul_dFC/task_data_simulator.py" \
+python "$PYDFC_CODE_DIR/simul_dFC/task_data_simulator.py" \
 --dataset_info $DATASET_INFO \
 --tasks_info $TASKS_INFO \
 --participant_id $SUBJECT_ID
diff --git a/simul_dFC/run_scripts_slurm/tasks_info_ds003465.json b/simul_dFC/run_scripts_slurm/tasks_info_ds003465.json
index 01d3a0e..81d4f88 100644
--- a/simul_dFC/run_scripts_slurm/tasks_info_ds003465.json
+++ b/simul_dFC/run_scripts_slurm/tasks_info_ds003465.json
@@ -1,7 +1,7 @@
 {
     "task-Axcpt": {
         "task_name": "task-Axcpt",
-        "task_data": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/ds003465/derivatives/ROI_timeseries/{subj_id}/ses-wave1bas/{subj_id}_ses-wave1bas_task-Axcpt_run-1_task-data.npy",
+        "task_data": "/path/to/your/data/ds003465/derivatives/ROI_timeseries/{subj_id}/ses-wave1bas/{subj_id}_ses-wave1bas_task-Axcpt_run-1_task-data.npy",
         "TAVG_period": 1.0,
         "num_stimulated_regions": 5,
         "global_conn_coupling_coef": 0.0126,
@@ -11,7 +11,7 @@
     },
     "task-Cuedts": {
         "task_name": "task-Cuedts",
-        "task_data": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/ds003465/derivatives/ROI_timeseries/{subj_id}/ses-wave1bas/{subj_id}_ses-wave1bas_task-Cuedts_run-1_task-data.npy",
+        "task_data": "/path/to/your/data/ds003465/derivatives/ROI_timeseries/{subj_id}/ses-wave1bas/{subj_id}_ses-wave1bas_task-Cuedts_run-1_task-data.npy",
         "TAVG_period": 1.0,
         "num_stimulated_regions": 5,
         "global_conn_coupling_coef": 0.0126,
@@ -21,7 +21,7 @@
     },
     "task-Stern": {
         "task_name": "task-Stern",
-        "task_data": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/ds003465/derivatives/ROI_timeseries/{subj_id}/ses-wave1bas/{subj_id}_ses-wave1bas_task-Stern_run-1_task-data.npy",
+        "task_data": "/path/to/your/data/ds003465/derivatives/ROI_timeseries/{subj_id}/ses-wave1bas/{subj_id}_ses-wave1bas_task-Stern_run-1_task-data.npy",
         "TAVG_period": 1.0,
         "num_stimulated_regions": 5,
         "global_conn_coupling_coef": 0.0126,
@@ -31,7 +31,7 @@
     },
     "task-Stroop": {
         "task_name": "task-Stroop",
-        "task_data": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/ds003465/derivatives/ROI_timeseries/{subj_id}/ses-wave1bas/{subj_id}_ses-wave1bas_task-Stroop_run-1_task-data.npy",
+        "task_data": "/path/to/your/data/ds003465/derivatives/ROI_timeseries/{subj_id}/ses-wave1bas/{subj_id}_ses-wave1bas_task-Stroop_run-1_task-data.npy",
         "TAVG_period": 1.0,
         "num_stimulated_regions": 5,
         "global_conn_coupling_coef": 0.0126,
diff --git a/task_dFC/README.rst b/task_dFC/README.rst
index 67ab300..dd78b7c 100644
--- a/task_dFC/README.rst
+++ b/task_dFC/README.rst
@@ -8,73 +8,126 @@ PydFC: task_dFC Module Documentation
 
 The ``task_dFC`` module provides a scalable, open-source Python solution for the **large-scale benchmarking and application of dynamic functional connectivity (dFC) methods**.
 
-Its core purpose is to apply end-to-end analytical workflows to fMRI data to assess the efficacy of various dFC methodologies in **predicting moment-to-moment cognitive states**-specifically, distinguishing between moments of task engagement versus rest at the single repetition time (TR) resolution.
+Its core purpose is to apply end-to-end analytical workflows to fMRI data to assess the efficacy of various dFC methodologies in **predicting moment-to-moment cognitive states** — specifically, distinguishing between moments of task engagement versus rest at the single repetition time (TR) resolution.
 
 Methods Implemented
 -------------------
 
-The module supports a diverse selection of seven well-established dFC methodologies implemented within the PydFC toolbox :
+The module supports a diverse selection of seven well-established dFC methodologies implemented within the PydFC toolbox:
 
-*   **State-free Methods:** Designed to capture continuous fluctuations in connectivity .
+*   **State-free Methods:** Designed to capture continuous fluctuations in connectivity.
 
-    *   Sliding Window (SW).
-    *   Time-Frequency (TF).
+    *   Sliding Window (SW)
+    *   Time-Frequency (TF)
 
-*   **State-based Methods:** Designed to identify recurring, discrete connectivity patterns or states .
+*   **State-based Methods:** Designed to identify recurring, discrete connectivity patterns or states.
+
+    *   Co-Activation Patterns (CAP)
+    *   Clustering (SWC)
+    *   Continuous Hidden Markov Models (CHMM)
+    *   Discrete Hidden Markov Models (DHMM)
+    *   Windowless (WL)
+
+Prerequisites
+-------------
+
+*   Preprocessed fMRI data in BIDS format with ``events.tsv`` files (e.g., via fMRIPrep).
+*   PydFC installed (see the root ``README.rst``).
+*   For fMRIPrep preprocessing: ``nipoppy`` installed and configured.
+
+Configuration Files
+-------------------
+
+Before running the pipeline, fill in the following JSON configuration files located in ``run_scripts_slurm/`` or ``run_scripts_sge/``:
+
+*   ``dataset_info.json`` — dataset name, root paths, sessions, tasks, runs, and BOLD suffix.
+*   ``methods_config.json`` — dFC method parameters, method list, and parallelism settings.
+*   ``multi_dataset_info.json`` — paths and dataset lists for cross-dataset analysis.
+*   ``global_config.json`` (named ``global_configs.json`` in ``run_scripts_sge/``) — nipoppy configuration for fMRIPrep (containers, FreeSurfer license, TemplateFlow).
 
-    *   Co-Activation Patterns (CAP).
-    *   Clustering (SWC).
-    *   Continuous Hidden Markov Models (CHMM).
-    *   Discrete Hidden Markov Models (DHMM).
-    *   Windowless (WL).
 Analysis Pipeline: Script-Based Workflow
----------------------------------------
+-----------------------------------------
 
-The ``task_dFC`` workflow starts assuming that fMRI data (in BIDS format with ``events.tsv``) has undergone standard preprocessing (via fMRIprep) . The subsequent analysis is executed sequentially through the following scripts:
+The ``task_dFC`` workflow assumes that the fMRI data (in BIDS format with ``events.tsv``) has already undergone standard preprocessing (via fMRIPrep). The subsequent analysis is executed sequentially through the following scripts:
 
 1. ``nifti_to_roi_signal.py``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-**Function:** Runs denoising and extracts regional BOLD time series from preprocessed NIfTI data .
+**Function:** Runs denoising and extracts regional BOLD time series from preprocessed NIfTI data.
 
 **Details:** Voxel-wise BOLD signals are parcellated, typically using an atlas such as the Schaefer 100-region atlas, yielding regional time series that serve as the input for dFC assessment.
 
-1. ``FCS_estimate.py``
+2. ``FCS_estimate.py``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-**Function:** Estimates Functional Connectivity States (FCS) .
+**Function:** Estimates Functional Connectivity States (FCS).
 
-**Details:** This script fits the dFC model required by **state-based methodologies** (CAP, HMM, Clustering) that rely on identifying **group-level recurring patterns** .
+**Details:** This script fits the dFC model required by **state-based methodologies** (CAP, HMM, Clustering) that rely on identifying **group-level recurring patterns**. Must be run before ``dFC_assessment.py``.
 
-1. ``dFC_assessment.py``
+3. ``dFC_assessment.py``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-**Function:** Computes individual-level dFC patterns .
+**Function:** Computes individual-level dFC patterns.
 
-**Details:** The script applies the seven implemented dFC methodologies (SW, TF, CAP, etc.) to the BOLD signals of each run and subject to obtain the corresponding high-dimensional dFC patterns .
+**Details:** Applies the seven implemented dFC methodologies (SW, TF, CAP, etc.) to the BOLD signals of each run and subject to obtain the corresponding high-dimensional dFC patterns.
 
-1. ``ML.py``
+4. ``ML.py``
 ~~~~~~~~~~~~~~~~~~~~
-**Function:** Implements the core machine learning pipeline, including cognitive state labeling, feature extraction, supervised classification, and separability analysis .
+**Function:** Implements the core machine learning pipeline, including cognitive state labeling, feature extraction, supervised classification, and separability analysis.
 
 **A. Task Presence Labeling**
-*   Initial stimulus timings from ``events.tsv`` are convolved with a canonical **Hemodynamic Response Function (HRF)** to account for hemodynamic delay .
-*   The HRF-convolved signal is binarized using a **Gaussian Mixture Model (GMM)** to assign time points as "rest" or "task-present" . This process critically identifies and removes ambiguous **"gray zone" time points** corresponding to transitions, improving classifier performance .
+
+*   Initial stimulus timings from ``events.tsv`` are convolved with a canonical **Hemodynamic Response Function (HRF)** to account for hemodynamic delay.
+*   The HRF-convolved signal is binarized using a **Gaussian Mixture Model (GMM)** to assign time points as "rest" or "task-present". This process critically identifies and removes ambiguous **"gray zone" time points** corresponding to transitions, improving classifier performance.
 
 **B. Feature Extraction and Reduction**
-*   **State-free Methods (SW, TF):** DFC matrices are vectorized (e.g., 4950 connections for Schaefer 100-region atlas) . **Laplacian Eigenmaps (LE)** dimensionality reduction is applied to make the high-dimensional discriminative information accessible to classifiers .
-*   **State-based Methods (CAP, HMM, etc.):** Features are derived from state probabilities, distances from states, or state weights . These resulting compositional features (shape (time, states)) are transformed using an **isometric log-ratio (ILR) transformation** .
+
+*   **State-free Methods (SW, TF):** dFC matrices are vectorized (e.g., 4950 connections for the Schaefer 100-region atlas). **Laplacian Eigenmaps (LE)** dimensionality reduction is applied to make the high-dimensional discriminative information accessible to classifiers.
+*   **State-based Methods (CAP, HMM, etc.):** Features are derived from state probabilities, distances from states, or state weights. These resulting compositional features (shape ``(time, states)``) are transformed using an **isometric log-ratio (ILR) transformation**.
 
 **C. Prediction and Evaluation**
-*   A **Support Vector Machine (SVM) with an RBF kernel** is trained to predict the cognitive state (rest vs. task) at the single-TR level .
-*   **Balanced Accuracy** is used as the primary metric, ensuring chance performance is 50% .
-*   **Cognitive State Separability** is quantified using the **Silhouette Index (SI)** to evaluate whether task and rest samples are intrinsically distinguishable in the feature space without supervision .
 
-1. ``generate_report.py``
+*   A **Support Vector Machine (SVM) with an RBF kernel** is trained to predict the cognitive state (rest vs. task) at the single-TR level.
+*   **Balanced Accuracy** is used as the primary metric, ensuring chance performance is 50%.
+*   **Cognitive State Separability** is quantified using the **Silhouette Index (SI)** to evaluate whether task and rest samples are intrinsically distinguishable in the feature space without supervision.
+
+5. ``generate_report.py``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-**Function:** Summarizes classification efficacy and separability results for individual datasets and paradigms .
+**Function:** Summarizes classification efficacy and separability results for individual datasets and paradigms.
 
 **Details:** Generates figures, tables, and reports (e.g., heatmaps and boxplots) documenting Balanced Accuracy and SI scores across methods and paradigms.
 
-6. ``multi_dataset_analysis``
+6. ``multi_dataset_analysis/``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-**Function:** Contains scripts for aggregating and comparing results across multiple datasets and paradigms.
+**Function:** Aggregates and compares results across multiple datasets and paradigms.
+
+**Details:** Facilitates **large-scale benchmarking** by calculating aggregate performance statistics across datasets and task paradigms. Run via ``run_scripts_slurm/run_across_dataset_analysis.sh`` (or its ``run_scripts_sge/`` counterpart).
+
+Available scripts:
+
+*   ``performance_predict.py`` — prediction accuracy across datasets and methods
+*   ``performance_factor.py`` — factors driving performance differences
+*   ``ml_results.py`` — ML pipeline result summaries
+*   ``dfc_visualization.py`` — dFC pattern visualizations
+*   ``embedding_visualization.py`` — low-dimensional embedding visualizations
+*   ``sample_matrix_visualization.py`` — sample dFC matrix plots
+*   ``task_presence_binarization.py`` — task label binarization diagnostics
+*   ``task_timing_stats.py`` — task timing statistics
+*   ``cohensd.py`` — effect size analysis
+
+Running the Pipeline
+--------------------
+
+Cluster job scripts are provided in two formats:
+
+*   ``run_scripts_slurm/`` — for SLURM-based clusters (e.g., Compute Canada / Alliance)
+*   ``run_scripts_sge/`` — for SGE-based clusters
+
+Each script contains a **"Cluster configuration"** block at the top where you set your virtual environment path and pydfc code directory before submitting. Array jobs (``run_dFC.sh``, ``run_nifti_to_roi.sh``, ``run_fmriprep.sh``) expect a ``subj_list.txt`` file listing one subject ID per line.
+
+Typical submission order (SLURM shown; on SGE submit the ``run_scripts_sge/`` scripts with ``qsub``)::
 
-**Details:** Facilitates **large-scale benchmarking** by calculating aggregate performance statistics (e.g., accuracy distribution) across datasets and task paradigms analyzed.
+    sbatch run_fmriprep.sh         # (optional) if preprocessing not done
+    sbatch run_nifti_to_roi.sh
+    sbatch run_FCS.sh
+    sbatch --array=1-N run_dFC.sh
+    sbatch run_ML.sh
+    sbatch run_report.sh
+    sbatch run_across_dataset_analysis.sh <script_name>
diff --git a/task_dFC/run_scripts_sge/dataset_info.json b/task_dFC/run_scripts_sge/dataset_info.json
index 30531e6..b01dbda 100644
--- a/task_dFC/run_scripts_sge/dataset_info.json
+++ b/task_dFC/run_scripts_sge/dataset_info.json
@@ -1,15 +1,13 @@
 {
 	"dataset" : "",
-	"main_root" : "/data/origami/dFC/DATA/task-based/openneuro/{dataset}",
-	"bids_root" : "{main_root}/bids",
-	"fmriprep_root" : "{main_root}/derivatives/fmriprep/23.1.3/output",
+	"main_root" : "/path/to/your/data/{dataset}",
+	"bids_root" : "/path/to/your/data/{dataset}/bids",
+	"fmriprep_root" : "/path/to/your/data/{dataset}/derivatives/fmriprep/23.1.3/output",
 	"roi_root" : "{main_root}/derivatives/ROI_timeseries",
 	"fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES",
 	"dFC_root" : "{main_root}/derivatives/dFC_assessed",
 	"ML_root" : "{main_root}/derivatives/ML",
 	"reports_root" : "{main_root}/derivatives/reports",
-	"trial_type_label" : "trial_type",
-	"rest_labels" : ["rest", "Rest"],
 	"bold_suffix" : "_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz",
 	"SESSIONS" : [
 		"ses-1"
@@ -19,5 +17,11 @@
 	],
 	"RUNS" : {
     		"task-A": ["run-01", "run-02", "run-03", "run-04", "run-05", "run-06"]
+	},
+	"trial_type_label" : {
+			"task-A": "trial_type"
+	},
+	"rest_labels" : {
+			"task-A": ["rest", "Rest"]
 	}
 }
diff --git a/task_dFC/run_scripts_sge/global_configs.json b/task_dFC/run_scripts_sge/global_configs.json
index ada5894..04d15c4 100644
--- a/task_dFC/run_scripts_sge/global_configs.json
+++ b/task_dFC/run_scripts_sge/global_configs.json
@@ -1,54 +1,170 @@
 {
-    "DATASET_NAME": "",
-    "DATASET_ROOT": "/data/origami/dFC/DATA/task-based/openneuro//",
-
-    "CONTAINER_STORE": "/data/origami/container_store/nipoppy/",
-
-    "SINGULARITY_PATH": "singularity",
-
-    "TEMPLATEFLOW_DIR": "/data/origami/templateflow",
-
-    "SESSIONS": [],
-    "VISITS": [],
-
-    "BIDS": {
-        "heudiconv": {
-            "VERSION": "0.11.6",
-            "CONTAINER": "heudiconv_{}.sif",
-            "URL": ""
-        },
-        "validator":{
-            "CONTAINER": "bids_validator.sif",
-            "URL": ""
-
-        }
+    "DATASET_NAME": "<DATASET_NAME>",
+    "VISIT_IDS": [
+        "<VISIT_LABEL>",
+        "<OTHER_VISIT_LABEL>"
+    ],
+    "SESSION_IDS": [
+        "<SESSION_LABEL>",
+        "<OTHER_SESSION_LABEL>"
+    ],
+    "SUBSTITUTIONS": {
+        "[[NIPOPPY_DPATH_CONTAINERS]]": "/path/to/your/container_store/nipoppy",
+        "[[HEUDICONV_HEURISTIC_FILE]]": "",
+        "[[DCM2BIDS_CONFIG_FILE]]": "",
+        "[[FREESURFER_LICENSE_FILE]]": "/path/to/your/freesurfer/license.txt",
+        "[[TEMPLATEFLOW_HOME]]": "/path/to/your/templateflow"
     },
-
-    "PROC_PIPELINES": {
-        "mriqc": {
-            "VERSION": "23.1.0",
-            "CONTAINER": "mriqc_{}.sif",
-            "URL": ""
+    "DICOM_DIR_PARTICIPANT_FIRST": true,
+    "CONTAINER_CONFIG": {
+        "COMMAND": "apptainer",
+        "ARGS": [
+            "--cleanenv"
+        ]
+    },
+    "BIDS_PIPELINES": [
+        {
+            "NAME": "heudiconv",
+            "VERSION": "0.12.2",
+            "CONTAINER_INFO": {
+                "FILE": "[[NIPOPPY_DPATH_CONTAINERS]]/[[PIPELINE_NAME]]_[[PIPELINE_VERSION]].sif",
+                "URI": "docker://nipy/[[PIPELINE_NAME]]:[[PIPELINE_VERSION]]"
+            },
+            "STEPS": [
+                {
+                    "NAME": "prepare",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor.json"
+                },
+                {
+                    "NAME": "convert",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor.json",
+                    "CONTAINER_CONFIG": {
+                        "ARGS": [
+                            "--bind",
+                            "[[HEUDICONV_HEURISTIC_FILE]]"
+                        ]
+                    },
+                    "UPDATE_DOUGHNUT": true
+                }
+            ]
+        },
+        {
+            "NAME": "dcm2bids",
+            "VERSION": "3.1.0",
+            "CONTAINER_INFO": {
+                "FILE": "[[NIPOPPY_DPATH_CONTAINERS]]/[[PIPELINE_NAME]]_[[PIPELINE_VERSION]].sif",
+                "URI": "docker://unfmontreal/[[PIPELINE_NAME]]:[[PIPELINE_VERSION]]"
+            },
+            "STEPS": [
+                {
+                    "NAME": "prepare",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor-dcm2bids_helper.json"
+                },
+                {
+                    "NAME": "convert",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor-dcm2bids.json",
+                    "CONTAINER_CONFIG": {
+                        "ARGS": [
+                            "--bind",
+                            "[[DCM2BIDS_CONFIG_FILE]]"
+                        ]
+                    },
+                    "UPDATE_DOUGHNUT": true
+                }
+            ]
         },
-        "fmriprep": {
+        {
+            "NAME": "bidscoin",
+            "VERSION": "4.3.2",
+            "STEPS": [
+                {
+                    "NAME": "prepare",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor-bidsmapper.json",
+                    "ANALYSIS_LEVEL": "group"
+                },
+                {
+                    "NAME": "edit",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor-bidseditor.json",
+                    "ANALYSIS_LEVEL": "group"
+                },
+                {
+                    "NAME": "convert",
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation-[[STEP_NAME]].json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor-bidscoiner.json",
+                    "ANALYSIS_LEVEL": "participant",
+                    "UPDATE_DOUGHNUT": true
+                }
+            ]
+        }
+    ],
+    "PROC_PIPELINES": [
+        {
+            "NAME": "fmriprep",
             "VERSION": "23.1.3",
-            "CONTAINER": "fmriprep_{}.sif",
-            "URL": ""
+            "CONTAINER_INFO": {
+                "FILE": "[[NIPOPPY_DPATH_CONTAINERS]]/[[PIPELINE_NAME]]_[[PIPELINE_VERSION]].sif",
+                "URI": "docker://nipreps/[[PIPELINE_NAME]]:[[PIPELINE_VERSION]]"
+            },
+            "CONTAINER_CONFIG": {
+                "ENV_VARS": {
+                    "TEMPLATEFLOW_HOME": "[[TEMPLATEFLOW_HOME]]"
+                },
+                "ARGS": [
+                    "--bind",
+                    "[[FREESURFER_LICENSE_FILE]]",
+                    "--bind",
+                    "[[TEMPLATEFLOW_HOME]]"
+                ]
+            },
+            "STEPS": [
+                {
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation.json",
+                    "GENERATE_PYBIDS_DATABASE": false,
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor.json",
+                    "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/tracker_config.json"
+                }
+            ]
         },
-        "freesurfer": {
+        {
+            "NAME": "freesurfer",
             "VERSION": "7.3.2",
-            "CONTAINER": "fmriprep_{}.sif",
-            "URL": ""
-        }
-    },
-
-    "TABULAR": {
-        "data_dictionary": {
-            "PATH": "",
-            "VERSION": "",
-            "URL": ""
+            "DESCRIPTION": "Freesurfer version associated with fMRIPrep 23.1.3",
+            "STEPS": [
+                {
+                    "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/tracker_config.json"
+                }
+            ]
+        },
+        {
+            "NAME": "mriqc",
+            "VERSION": "23.1.0",
+            "CONTAINER_INFO": {
+                "FILE": "[[NIPOPPY_DPATH_CONTAINERS]]/[[PIPELINE_NAME]]_[[PIPELINE_VERSION]].sif",
+                "URI": "docker://nipreps/[[PIPELINE_NAME]]:[[PIPELINE_VERSION]]"
+            },
+            "CONTAINER_CONFIG": {
+                "ENV_VARS": {
+                    "TEMPLATEFLOW_HOME": "[[TEMPLATEFLOW_HOME]]"
+                },
+                "ARGS": [
+                    "--bind",
+                    "[[TEMPLATEFLOW_HOME]]"
+                ]
+            },
+            "STEPS": [
+                {
+                    "INVOCATION_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/invocation.json",
+                    "DESCRIPTOR_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/descriptor.json",
+                    "TRACKER_CONFIG_FILE": "[[NIPOPPY_DPATH_PIPELINES]]/[[PIPELINE_NAME]]-[[PIPELINE_VERSION]]/tracker_config.json"
+                }
+            ]
         }
-    },
-
-    "WORKFLOWS": []
+    ],
+    "CUSTOM": {}
 }
diff --git a/task_dFC/run_scripts_sge/methods_config.json b/task_dFC/run_scripts_sge/methods_config.json
index ee96381..722b4ff 100644
--- a/task_dFC/run_scripts_sge/methods_config.json
+++ b/task_dFC/run_scripts_sge/methods_config.json
@@ -1,18 +1,23 @@
 {
     "params_methods" : {
-        "W": 12,
+        "W": 44,
         "n_overlap": 1.0,
         "sw_method": "pear_corr",
         "tapered_window": true,
         "TF_method": "WTC",
         "clstr_base_measure": "SlidingWindow",
+        "clstr_distance": "manhattan",
         "hmm_iter": 20,
         "dhmm_obs_state_ratio": 0.666,
-        "n_states": 12,
+        "n_states": 5,
         "n_subj_clstrs": 10,
-        "n_jobs": 2,
         "verbose": 0,
-        "backend": "loky",
+        "n_jobs_sw": 8,
+        "backend_sw": "threading",
+        "n_jobs_tf": 2,
+        "backend_tf": "loky",
+        "n_jobs_swc": null,
+        "backend_swc": null,
         "normalization": true,
         "num_subj": null,
         "num_time_point": null
diff --git a/task_dFC/run_scripts_sge/multi_dataset_info.json b/task_dFC/run_scripts_sge/multi_dataset_info.json
new file mode 100644
index 0000000..de0cf2b
--- /dev/null
+++ b/task_dFC/run_scripts_sge/multi_dataset_info.json
@@ -0,0 +1,38 @@
+{
+	"output_root": "/path/to/your/data/multi_dataset_analysis/results",
+	"real_data": {
+		"main_root": "/path/to/your/data/openneuro",
+		"DATASETS": [
+			"ds001242", "ds002236", "ds002647",
+			"ds002843", "ds002994",
+			"ds003465", "ds003612", "ds003823",
+			"ds004044", "ds004349", "ds004359",
+			"ds004556", "ds004746", "ds004791",
+			"ds004848", "ds005038"
+		],
+		"TASKS_to_include": [
+			"task-arithmetic", "task-AudSem", "task-Axcpt",
+			"task-Cuedts", "task-emotionRegulation", "task-execution","task-expo",
+			"task-fearlearning", "task-feedback", "task-fribBids", "task-IHG",
+			"task-imagery", "task-itc", "task-localiser", "task-Localizer",
+			"task-matching", "task-motor", "task-paingen", "task-ppalocalizer",
+			"task-recall", "task-risk", "task-ST", "task-Stern",
+			"task-Stroop", "task-VisRhyme", "task-VisSem", "task-VisSpell",
+			"task-vswm"
+    	]
+	},
+	"simulated_data": {
+		"main_root": "/path/to/your/data/simulated",
+		"DATASETS": [
+			"ds000001", "ds000002", "ds000003", "ds000004", "ds000005", "ds000006"
+		],
+		"TASKS_to_include": [
+			"task-Axcpt", "task-Cuedts", "task-Stern", "task-Stroop",
+			"task-lowFreqLongRest", "task-lowFreqShortRest", "task-lowFreqShortTask",
+			"task-imagery", "task-execution",
+			"task-itc", "task-risk",
+			"task-Localizer",
+			"task-ppalocalizer"
+		]
+	}
+}
diff --git a/task_dFC/run_scripts_sge/run_FCS.sh b/task_dFC/run_scripts_sge/run_FCS.sh
index a84c578..9601a5c 100644
--- a/task_dFC/run_scripts_sge/run_FCS.sh
+++ b/task_dFC/run_scripts_sge/run_FCS.sh
@@ -1,18 +1,37 @@
 #!/bin/sh
 #
+# Keep -cwd: Grid Engine otherwise starts the job in the home directory, and the logs/ and ./*.json paths below are relative.
 #$ -cwd
+#$ -N fit_fcs_job
 #$ -o logs/fcs_out.txt
 #$ -e logs/fcs_err.txt
-#$ -l h_vmem=64G
-#$ -q origami.q
+#$ -l h_rt=168:00:00
+#$ -pe smp 8
+#$ -l h_vmem=8g
+#$ -q YOUR_QUEUE
+
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# For conda environments, replace the two lines above with:
+#   CONDA_SH="/path/to/conda/etc/profile.d/conda.sh"
+#   CONDA_ENV="pydfc"
+# -----------------------------------------------------------
 
 DATASET_INFO="./dataset_info.json"
 METHODS_CONFIG="./methods_config.json"
 
-source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
-conda activate pydfc
-python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/FCS_estimate.py" \
+export OMP_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+export OPENBLAS_NUM_THREADS=1
+export NUMEXPR_NUM_THREADS=1
+
+# Activate virtual environment
+source "$VENV_PATH"
+# For conda: source "$CONDA_SH" && conda activate "$CONDA_ENV"
+
+python "$PYDFC_CODE_DIR/task_dFC/FCS_estimate.py" \
 --dataset_info $DATASET_INFO \
 --methods_config $METHODS_CONFIG
 
-conda deactivate
+deactivate
diff --git a/task_dFC/run_scripts_sge/run_ML.sh b/task_dFC/run_scripts_sge/run_ML.sh
index 4ec431a..2cc1339 100644
--- a/task_dFC/run_scripts_sge/run_ML.sh
+++ b/task_dFC/run_scripts_sge/run_ML.sh
@@ -1,16 +1,25 @@
 #!/bin/sh
 #
+# Keep -cwd: Grid Engine otherwise starts the job in the home directory, and the logs/ and ./dataset_info.json paths below are relative.
 #$ -cwd
+#$ -N ml_job
 #$ -o logs/ML_out.txt
 #$ -e logs/ML_err.txt
-#$ -l h_vmem=64G
-#$ -q origami.q
+#$ -pe smp 8
+#$ -l h_vmem=16g
+#$ -q YOUR_QUEUE
+
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# -----------------------------------------------------------
 
 DATASET_INFO="./dataset_info.json"
 
-source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
-conda activate pydfc
-python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/ML.py" \
+# Activate virtual environment
+source "$VENV_PATH"
+
+python "$PYDFC_CODE_DIR/task_dFC/ML.py" \
 --dataset_info $DATASET_INFO
 
-conda deactivate
+deactivate
diff --git a/task_dFC/run_scripts_sge/run_across_dataset_analysis.sh b/task_dFC/run_scripts_sge/run_across_dataset_analysis.sh
new file mode 100644
index 0000000..34a0fc8
--- /dev/null
+++ b/task_dFC/run_scripts_sge/run_across_dataset_analysis.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+# bash is required: the script uses `set -o pipefail` and `source`, which plain sh may not provide.
+#$ -S /bin/bash
+#$ -cwd
+#$ -N across_dataset_analysis
+#$ -o logs/across_dataset_analysis_out.txt
+#$ -e logs/across_dataset_analysis_err.txt
+#$ -l h_rt=05:00:00
+#$ -l h_vmem=32g
+#$ -q YOUR_QUEUE
+
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# -----------------------------------------------------------
+
+set -euo pipefail
+
+mkdir -p logs
+source "$VENV_PATH"
+
+MULTI_DATASET_INFO="$PYDFC_CODE_DIR/task_dFC/run_scripts_sge/multi_dataset_info.json"
+
+SCRIPT_NAME=${1:-}
+SIMUL_OR_REAL=${2:-real}
+SCRIPT_DIR="$PYDFC_CODE_DIR/task_dFC/multi_dataset_analysis"
+SCRIPT_PATH="$SCRIPT_DIR/$SCRIPT_NAME"
+
+if [ -z "$SCRIPT_NAME" ]; then
+    echo "Usage: qsub run_across_dataset_analysis.sh <script_name> [real|simulated]"
+    exit 1
+fi
+
+if [ ! -f "$SCRIPT_PATH" ]; then
+    echo "Error: Script '$SCRIPT_PATH' not found."
+    exit 1
+fi
+
+case "$SCRIPT_NAME" in
+  performance_predict.py | performance_factor.py | ml_results.py | dfc_visualization.py | embedding_visualization.py | sample_matrix_visualization.py | task_presence_binarization.py | task_timing_stats.py | cohensd.py)
+    python "$SCRIPT_PATH" --multi_dataset_info "$MULTI_DATASET_INFO" --simul_or_real "$SIMUL_OR_REAL"
+    ;;
+  *)
+    echo "Unknown script: $SCRIPT_NAME"
+    exit 1
+    ;;
+esac
+
+deactivate
diff --git a/task_dFC/run_scripts_sge/run_dFC.sh b/task_dFC/run_scripts_sge/run_dFC.sh
index 124dc1f..463fbcf 100644
--- a/task_dFC/run_scripts_sge/run_dFC.sh
+++ b/task_dFC/run_scripts_sge/run_dFC.sh
@@ -1,23 +1,35 @@
 #!/bin/sh
 #
+# Keep -cwd: Grid Engine otherwise starts the job in the home directory, and logs/, ./subj_list.txt and the ./*.json paths below are relative.
 #$ -cwd
+#$ -N assess_dfc_job
 #$ -o logs/dfc_out.txt
 #$ -e logs/dfc_err.txt
-#$ -l h_vmem=32G
-#$ -q origami.q
+#$ -l h_rt=24:00:00
+#$ -l h_vmem=32g
+#$ -t 1-NSUBJECTS
+#$ -q YOUR_QUEUE
+
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# -----------------------------------------------------------
 
 SUBJECT_LIST="./subj_list.txt"
 DATASET_INFO="./dataset_info.json"
+METHODS_CONFIG="./methods_config.json"
 
 echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`"
 
 SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST`
 echo "Subject ID: $SUBJECT_ID"
 
-source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
-conda activate pydfc
-python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/dFC_assessment.py" \
+# Activate virtual environment
+source "$VENV_PATH"
+
+python "$PYDFC_CODE_DIR/task_dFC/dFC_assessment.py" \
 --dataset_info $DATASET_INFO \
+--methods_config $METHODS_CONFIG \
 --participant_id $SUBJECT_ID
 
-conda deactivate
+deactivate
diff --git a/task_dFC/run_scripts_sge/run_fmriprep.sh b/task_dFC/run_scripts_sge/run_fmriprep.sh
index ada2813..075b581 100644
--- a/task_dFC/run_scripts_sge/run_fmriprep.sh
+++ b/task_dFC/run_scripts_sge/run_fmriprep.sh
@@ -1,26 +1,33 @@
 #!/bin/bash
 #
+# Keep -cwd: the script reads ./subj_list.txt and derives the dataset root from $(pwd), so it must run in the submission directory.
 #$ -cwd
+#$ -N fmriprep_job
 #$ -o logs/fmriprep_out.log
 #$ -e logs/fmriprep_err.log
-#$ -l h_rt=24:00:00
-#$ -l h_vmem=32G
-#$ -q origami.q
+#$ -l h_vmem=16g
+#$ -pe smp 8
+#$ -t 1-NSUBJECTS
+#$ -q YOUR_QUEUE
 
-# TODO replace with local paths
-source "/data/origami/dFC/anaconda3/etc/profile.d/conda.sh"
-conda activate nipoppy_env
+# ---- Cluster configuration (set these for your system) ----
+NIPOPPY_VENV_PATH="/path/to/your/nipoppy_venv/bin/activate"
+# -----------------------------------------------------------
+
+module load apptainer
+
+source "$NIPOPPY_VENV_PATH"
 
 SUBJECT_LIST="./subj_list.txt"
-GLOBAL_CONFIG="../proc/global_configs.json"
 
-echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`"
+echo "Number subjects found: $(wc -l < $SUBJECT_LIST)"
 
-SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST`
+SUBJECT_ID=$(sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST)
 echo "Subject ID: $SUBJECT_ID"
 
-python "/data/origami/dFC/CODEs/nipoppy/nipoppy/workflow/proc_pipe/fmriprep/run_fmriprep.py" \
---global_config $GLOBAL_CONFIG \
---participant_id $SUBJECT_ID
+nipoppy run \
+"$(dirname "$(pwd)")" \
+--pipeline fmriprep \
+--participant-id $SUBJECT_ID
 
-conda deactivate
+deactivate
diff --git a/task_dFC/run_scripts_sge/run_nifti_to_roi.sh b/task_dFC/run_scripts_sge/run_nifti_to_roi.sh
index 1fff1da..95c4b73 100644
--- a/task_dFC/run_scripts_sge/run_nifti_to_roi.sh
+++ b/task_dFC/run_scripts_sge/run_nifti_to_roi.sh
@@ -1,23 +1,41 @@
 #!/bin/sh
 #
+# Keep -cwd: Grid Engine otherwise starts the job in the home directory, and logs/, ./subj_list.txt and ./dataset_info.json below are relative.
 #$ -cwd
+#$ -N extract_roi_job
 #$ -o logs/roi_out.txt
 #$ -e logs/roi_err.txt
-#$ -l h_vmem=32G
-#$ -q origami.q
+#$ -l h_rt=24:00:00
+#$ -l h_vmem=64g
+#$ -t 1-NSUBJECTS
+#$ -q YOUR_QUEUE
 
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# -----------------------------------------------------------
+
+# -----------------------------
+# Inputs
+# -----------------------------
 SUBJECT_LIST="./subj_list.txt"
 DATASET_INFO="./dataset_info.json"
+DENOISING_STRATEGY=${1:-simple}
 
-echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`"
+echo "Denoising strategy: $DENOISING_STRATEGY"
+echo "Number of subjects: $(wc -l < "$SUBJECT_LIST")"
 
-SUBJECT_ID=`sed -n "${SGE_TASK_ID}p" $SUBJECT_LIST`
+SUBJECT_ID=$(sed -n "${SGE_TASK_ID}p" "$SUBJECT_LIST")
 echo "Subject ID: $SUBJECT_ID"
 
-source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
-conda activate pydfc
-python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/nifti_to_roi_signal.py" \
---dataset_info $DATASET_INFO \
---participant_id $SUBJECT_ID
+# -----------------------------
+# Environment
+# -----------------------------
+source "$VENV_PATH"
+
+python "$PYDFC_CODE_DIR/task_dFC/nifti_to_roi_signal.py" \
+    --dataset_info $DATASET_INFO \
+    --participant_id $SUBJECT_ID \
+    --denoising_strategy $DENOISING_STRATEGY
 
-conda deactivate
+deactivate
diff --git a/task_dFC/run_scripts_sge/run_report.sh b/task_dFC/run_scripts_sge/run_report.sh
index 2a00cc5..a1c13b4 100644
--- a/task_dFC/run_scripts_sge/run_report.sh
+++ b/task_dFC/run_scripts_sge/run_report.sh
@@ -1,18 +1,25 @@
 #!/bin/sh
 #
-#$ -cwd
+#$ -N report_job
 #$ -o logs/report_out.txt
 #$ -e logs/report_err.txt
-#$ -l h_vmem=16G
-#$ -q origami.q
+#$ -l h_rt=24:00:00
+#$ -l h_vmem=64g
+#$ -q YOUR_QUEUE
+
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# -----------------------------------------------------------
 
 DATASET_INFO="./dataset_info.json"
 SUBJ_LIST="./subj_list.txt"
 
-source /data/origami/dFC/anaconda3/etc/profile.d/conda.sh
-conda activate pydfc
-python "/data/origami/dFC/CODEs/pydfc/dFC/task_dFC/generate_report.py" \
+# Activate virtual environment
+source "$VENV_PATH"
+
+python "$PYDFC_CODE_DIR/task_dFC/generate_report.py" \
 --dataset_info $DATASET_INFO \
 --subj_list $SUBJ_LIST
 
-conda deactivate
+deactivate
diff --git a/task_dFC/run_scripts_slurm/dataset_info.json b/task_dFC/run_scripts_slurm/dataset_info.json
index c975277..b01dbda 100644
--- a/task_dFC/run_scripts_slurm/dataset_info.json
+++ b/task_dFC/run_scripts_slurm/dataset_info.json
@@ -1,8 +1,8 @@
 {
 	"dataset" : "",
-	"main_root" : "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/{dataset}",
-	"bids_root" : "/home/mt00/scratch/DATA/task-based/openneuro/{dataset}/bids",
-	"fmriprep_root" : "/home/mt00/scratch/DATA/task-based/openneuro/{dataset}/derivatives/fmriprep/23.1.3/output",
+	"main_root" : "/path/to/your/data/{dataset}",
+	"bids_root" : "/path/to/your/data/{dataset}/bids",
+	"fmriprep_root" : "/path/to/your/data/{dataset}/derivatives/fmriprep/23.1.3/output",
 	"roi_root" : "{main_root}/derivatives/ROI_timeseries",
 	"fitted_measures_root" : "{main_root}/derivatives/fitted_MEASURES",
 	"dFC_root" : "{main_root}/derivatives/dFC_assessed",
diff --git a/task_dFC/run_scripts_slurm/global_config.json b/task_dFC/run_scripts_slurm/global_config.json
index d818ab3..04d15c4 100644
--- a/task_dFC/run_scripts_slurm/global_config.json
+++ b/task_dFC/run_scripts_slurm/global_config.json
@@ -9,11 +9,11 @@
         "<OTHER_SESSION_LABEL>"
     ],
     "SUBSTITUTIONS": {
-        "[[NIPOPPY_DPATH_CONTAINERS]]": "/home/mt00/projects/def-jbpoline/container_store/nipoppy",
+        "[[NIPOPPY_DPATH_CONTAINERS]]": "/path/to/your/container_store/nipoppy",
         "[[HEUDICONV_HEURISTIC_FILE]]": "",
         "[[DCM2BIDS_CONFIG_FILE]]": "",
-        "[[FREESURFER_LICENSE_FILE]]": "/home/mt00/projects/def-jbpoline/mt00/freesurfer/license.txt",
-        "[[TEMPLATEFLOW_HOME]]": "/home/mt00/projects/def-jbpoline/templateflow"
+        "[[FREESURFER_LICENSE_FILE]]": "/path/to/your/freesurfer/license.txt",
+        "[[TEMPLATEFLOW_HOME]]": "/path/to/your/templateflow"
     },
     "DICOM_DIR_PARTICIPANT_FIRST": true,
     "CONTAINER_CONFIG": {
diff --git a/task_dFC/run_scripts_slurm/multi_dataset_info.json b/task_dFC/run_scripts_slurm/multi_dataset_info.json
index fd4b892..de0cf2b 100644
--- a/task_dFC/run_scripts_slurm/multi_dataset_info.json
+++ b/task_dFC/run_scripts_slurm/multi_dataset_info.json
@@ -1,7 +1,7 @@
 {
-	"output_root": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/multi_dataset_analysis/results",
+	"output_root": "/path/to/your/data/multi_dataset_analysis/results",
 	"real_data": {
-		"main_root": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro",
+		"main_root": "/path/to/your/data/openneuro",
 		"DATASETS": [
 			"ds001242", "ds002236", "ds002647",
 			"ds002843", "ds002994",
@@ -22,7 +22,7 @@
     	]
 	},
 	"simulated_data": {
-		"main_root": "/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/simulated",
+		"main_root": "/path/to/your/data/simulated",
 		"DATASETS": [
 			"ds000001", "ds000002", "ds000003", "ds000004", "ds000005", "ds000006"
 		],
diff --git a/task_dFC/run_scripts_slurm/run_FCS.sh b/task_dFC/run_scripts_slurm/run_FCS.sh
index fce086b..a0d27bf 100644
--- a/task_dFC/run_scripts_slurm/run_FCS.sh
+++ b/task_dFC/run_scripts_slurm/run_FCS.sh
@@ -15,10 +15,15 @@ export MKL_NUM_THREADS=1
 export OPENBLAS_NUM_THREADS=1
 export NUMEXPR_NUM_THREADS=1
 
-# Activate  virtual environment
-source "/home/mt00/venvs/pydfc/bin/activate"
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# -----------------------------------------------------------
 
-python "/home/mt00/pydfc/dFC/task_dFC/FCS_estimate.py" \
+# Activate virtual environment
+source "$VENV_PATH"
+
+python "$PYDFC_CODE_DIR/task_dFC/FCS_estimate.py" \
 --dataset_info $DATASET_INFO \
 --methods_config $METHODS_CONFIG
 
diff --git a/task_dFC/run_scripts_slurm/run_ML.sh b/task_dFC/run_scripts_slurm/run_ML.sh
index fd0632b..623d298 100644
--- a/task_dFC/run_scripts_slurm/run_ML.sh
+++ b/task_dFC/run_scripts_slurm/run_ML.sh
@@ -7,10 +7,15 @@
 
 DATASET_INFO="./dataset_info.json"
 
-# Activate  virtual environment
-source "/home/mt00/venvs/pydfc/bin/activate"
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# -----------------------------------------------------------
 
-python "/home/mt00/pydfc/dFC/task_dFC/ML.py" \
+# Activate virtual environment
+source "$VENV_PATH"
+
+python "$PYDFC_CODE_DIR/task_dFC/ML.py" \
 --dataset_info $DATASET_INFO
 
 deactivate
diff --git a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
index 8c77aef..1d150bb 100644
--- a/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
+++ b/task_dFC/run_scripts_slurm/run_across_dataset_analysis.sh
@@ -5,18 +5,24 @@
 #SBATCH --error=logs/%x_err.txt
 #SBATCH --time=05:00:00
 #SBATCH --mem=32G
-#SBATCH --chdir=/home/mt00/projects/def-jbpoline/mt00/DATA/task-based/openneuro/multi_dataset_analysis/codes
+# Note: run sbatch from your multi_dataset_analysis/codes directory, or uncomment and set --chdir:
+# #SBATCH --chdir=/path/to/multi_dataset_analysis/codes
+
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# -----------------------------------------------------------
 
 set -euo pipefail
 
 mkdir -p logs
-source "/home/mt00/venvs/pydfc/bin/activate"
+source "$VENV_PATH"
 
-MULTI_DATASET_INFO="/home/mt00/pydfc/dFC/task_dFC/run_scripts_slurm/multi_dataset_info.json"
+MULTI_DATASET_INFO="$PYDFC_CODE_DIR/task_dFC/run_scripts_slurm/multi_dataset_info.json"
 
 SCRIPT_NAME=${1:-}
 SIMUL_OR_REAL=${2:-real}
-SCRIPT_DIR="/home/mt00/pydfc/dFC/task_dFC/multi_dataset_analysis"
+SCRIPT_DIR="$PYDFC_CODE_DIR/task_dFC/multi_dataset_analysis"
 SCRIPT_PATH="$SCRIPT_DIR/$SCRIPT_NAME"
 
 if [ -z "$SCRIPT_NAME" ]; then
diff --git a/task_dFC/run_scripts_slurm/run_dFC.sh b/task_dFC/run_scripts_slurm/run_dFC.sh
index c785690..f8ec43e 100644
--- a/task_dFC/run_scripts_slurm/run_dFC.sh
+++ b/task_dFC/run_scripts_slurm/run_dFC.sh
@@ -15,10 +15,15 @@ echo "Number subjects found: `cat $SUBJECT_LIST | wc -l`"
 SUBJECT_ID=`sed -n "${SLURM_ARRAY_TASK_ID}p" $SUBJECT_LIST`
 echo "Subject ID: $SUBJECT_ID"
 
-# Activate  virtual environment
-source "/home/mt00/venvs/pydfc/bin/activate"
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# -----------------------------------------------------------
 
-python "/home/mt00/pydfc/dFC/task_dFC/dFC_assessment.py" \
+# Activate virtual environment
+source "$VENV_PATH"
+
+python "$PYDFC_CODE_DIR/task_dFC/dFC_assessment.py" \
 --dataset_info $DATASET_INFO \
 --methods_config $METHODS_CONFIG \
 --participant_id $SUBJECT_ID
diff --git a/task_dFC/run_scripts_slurm/run_fmriprep.sh b/task_dFC/run_scripts_slurm/run_fmriprep.sh
index 60e7da4..06cd377 100644
--- a/task_dFC/run_scripts_slurm/run_fmriprep.sh
+++ b/task_dFC/run_scripts_slurm/run_fmriprep.sh
@@ -8,7 +8,11 @@
 
 module load apptainer
 
-source "/home/mt00/venvs/nipoppy_env/bin/activate"
+# ---- Cluster configuration (set these for your system) ----
+NIPOPPY_VENV_PATH="/path/to/your/nipoppy_venv/bin/activate"
+# -----------------------------------------------------------
+
+source "$NIPOPPY_VENV_PATH"
 
 SUBJECT_LIST="./subj_list.txt"
 
diff --git a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
index 6e3c789..b2f9df4 100644
--- a/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
+++ b/task_dFC/run_scripts_slurm/run_nifti_to_roi.sh
@@ -19,12 +19,17 @@ echo "Number of subjects: $(wc -l < "$SUBJECT_LIST")"
 SUBJECT_ID=$(sed -n "${SLURM_ARRAY_TASK_ID}p" "$SUBJECT_LIST")
 echo "Subject ID: $SUBJECT_ID"
 
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# -----------------------------------------------------------
+
 # -----------------------------
 # Environment
 # -----------------------------
-source "/home/mt00/venvs/pydfc/bin/activate"
+source "$VENV_PATH"
 
-python "/home/mt00/pydfc/dFC/task_dFC/nifti_to_roi_signal.py" \
+python "$PYDFC_CODE_DIR/task_dFC/nifti_to_roi_signal.py" \
     --dataset_info $DATASET_INFO \
     --participant_id $SUBJECT_ID \
     --denoising_strategy $DENOISING_STRATEGY
diff --git a/task_dFC/run_scripts_slurm/run_report.sh b/task_dFC/run_scripts_slurm/run_report.sh
index f094835..12c6ebc 100644
--- a/task_dFC/run_scripts_slurm/run_report.sh
+++ b/task_dFC/run_scripts_slurm/run_report.sh
@@ -9,10 +9,15 @@
 DATASET_INFO="./dataset_info.json"
 SUBJ_LIST="./subj_list.txt"
 
-# Activate  virtual environment
-source "/home/mt00/venvs/pydfc/bin/activate"
+# ---- Cluster configuration (set these for your system) ----
+VENV_PATH="/path/to/your/venv/bin/activate"
+PYDFC_CODE_DIR="/path/to/pydfc"
+# -----------------------------------------------------------
 
-python "/home/mt00/pydfc/dFC/task_dFC/generate_report.py" \
+# Activate virtual environment
+source "$VENV_PATH"
+
+python "$PYDFC_CODE_DIR/task_dFC/generate_report.py" \
 --dataset_info $DATASET_INFO \
 --subj_list $SUBJ_LIST
 

From edcfab3213d1fc4766a4e0413369adb271683ca4 Mon Sep 17 00:00:00 2001
From: mtorabi59 <mohammad.torabi@mail.mcgill.ca>
Date: Thu, 11 Jun 2026 14:59:58 -0400
Subject: [PATCH 401/401] minor

---
 simul_dFC/README.rst | 2 +-
 task_dFC/README.rst  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/simul_dFC/README.rst b/simul_dFC/README.rst
index 2c54299..86eaf18 100644
--- a/simul_dFC/README.rst
+++ b/simul_dFC/README.rst
@@ -24,4 +24,4 @@ Set ``VENV_PATH`` and ``PYDFC_CODE_DIR`` in the cluster configuration block at t
 
 The script expects a ``subj_list.txt`` (one subject ID per line), a ``dataset_info.json``, and a ``tasks_info.json`` in the same directory as the run script.
 
-Simulated outputs are consumed directly by the ``task_dFC`` pipeline starting at ``dFC_assessment.py``.
+Simulated outputs are consumed directly by the ``task_dFC`` pipeline starting at ``FCS_estimate.py``.
diff --git a/task_dFC/README.rst b/task_dFC/README.rst
index dd78b7c..0de41ea 100644
--- a/task_dFC/README.rst
+++ b/task_dFC/README.rst
@@ -8,7 +8,7 @@ PydFC: task_dFC Module Documentation
 
 The ``task_dFC`` module provides a scalable, open-source Python solution for the **large-scale benchmarking and application of dynamic functional connectivity (dFC) methods**.
 
-Its core purpose is to apply end-to-end analytical workflows to fMRI data to assess the efficacy of various dFC methodologies in **predicting moment-to-moment cognitive states** — specifically, distinguishing between moments of task engagement versus rest at the single repetition time (TR) resolution.
+Its core purpose is to apply end-to-end analytical workflows to fMRI data to assess the efficacy of various dFC methodologies in **predicting ongoing cognitive states** — specifically, distinguishing between moments of task engagement and rest at the single repetition time (TR) resolution.
 
 Methods Implemented
 -------------------
@@ -129,5 +129,5 @@ Typical submission order::
     sbatch run_FCS.sh
     sbatch --array=1-N run_dFC.sh
     sbatch run_ML.sh
-    sbatch run_report.sh
+    sbatch run_report.sh         # (optional)
     sbatch run_across_dataset_analysis.sh <script_name>