From d160e9a103bd68341bb1bf1d4fb5e8a93e2de31c Mon Sep 17 00:00:00 2001
From: nandarania <nandaraniabril@gmail.com>
Date: Thu, 18 Aug 2022 16:38:46 -0400
Subject: [PATCH 1/2] LCMS scan extraction using argparser pandas and numpy.

---
 .../python_scripts/LCMS_Spectra/README.md     |  11 ++
 .../LCMS_Spectra/requirements.txt             |   3 +
 .../LCMS_Spectra/scan_extractions.py          | 136 ++++++++++++++++++
 3 files changed, 150 insertions(+)
 create mode 100644 LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/README.md
 create mode 100644 LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/requirements.txt
 create mode 100644 LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/scan_extractions.py

diff --git a/LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/README.md b/LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/README.md
new file mode 100644
index 00000000..38dcbcc7
--- /dev/null
+++ b/LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/README.md
@@ -0,0 +1,11 @@
+# MS1Spectra
+This combines a NegID.csv input file with LCMS scans to consctruct an output.csv.
+
+## Inputs
+
+    fn_negID = NegID file path
+    path = LCMS scans folder file path
+    fn_output = Desired output file path
+    mz_id = Mass to Charge Ratio NegID column id
+    rt_id = Retention Time NegID column id
+    dDa = Zoom window around NegID's mz
diff --git a/LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/requirements.txt b/LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/requirements.txt
new file mode 100644
index 00000000..72287444
--- /dev/null
+++ b/LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/requirements.txt
@@ -0,0 +1,3 @@
+pandas==1.4.2
+numpy==1.23.1
+argparse==1.1
diff --git a/LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/scan_extractions.py b/LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/scan_extractions.py
new file mode 100644
index 00000000..96913a5b
--- /dev/null
+++ b/LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/scan_extractions.py
@@ -0,0 +1,136 @@
+import pandas as pd
+import numpy as np
+from os import listdir
+import argparse
+
+def find_nearest(array, value):
+    #https://stackoverflow.com/questions/2566412/find-nearest-value-in-numpy-array
+    array = np.asarray(array)
+    idx = (np.abs(array - value)).argmin()
+    return idx, array[idx]
+
+def get_nearest_rows(df_lcms, times, indices, rt, mz):
+    # find the nearest time from scans file to negID's rt
+    selected_id, selected_rt = find_nearest(times, rt)
+    # get the corresponding rows from the scans file for that time
+    start = indices[selected_id]
+    end = indices[selected_id+1] if selected_id + 1 < indices.size else len(df_lcms.index)
+    return df_lcms.iloc[start:end, :].copy()
+
+def filter_1(df, mz):
+    # filtering could be applied but was generally not necessary
+    selected_mz_ID = df['m/z'].sub(mz).abs().idxmin()
+    Lowest_Intensity = df.at[selected_mz_ID,"Intensity"]*0.01
+    df.drop(df[(df.Intensity < Lowest_Intensity) & (df.Zoom == 0)].index, inplace = True)
+    # alternate approach
+    # return df[ (df["Intensity"] > Lowest_Intensity) | (df["Zoom"] == 1) ]
+
+def construct_table(df_lcms, lcms_times, lcms_indices, rt, fID, mz, dDa, filename):
+    df = get_nearest_rows(df_lcms, lcms_times, lcms_indices, rt, mz)
+    df.columns = ["Feature", "m/z", "Intensity"]
+    df["Feature"] = fID
+    df["Isotope"] = ""
+    df["Formula"] = ""
+    df["PredAbundance"] = ""
+    df["File"] = filename
+    df["Zoom"] = (np.abs(df["m/z"] - mz) < dDa) + 0
+    filter_1(df, mz)
+    return df.round({"m/z":5, "Intensity":1})
+
+def extract_lcms_at_negID_rts(fn_output, df_NegID, fn_lcms, mz_id, rt_id, dDa):
+    '''
+        load lcms scan
+        get unique times
+        for each row in negID
+        get rows nearest to that retention time from lcms scan
+        print the rows to the csv
+    '''
+    df_lcms = pd.read_csv(fn_lcms)
+    lcms_times, lcms_indices = np.unique(df_lcms.rt, return_index=True)
+    filename = fn_lcms.split("/")[-1]
+
+    for row_id in range(len(df_NegID.index)):
+        mz = df_NegID.iat[row_id,mz_id]      #mass to charge ratio
+        rt = df_NegID.iat[row_id,rt_id] * 60 #retention time
+        fID = df_NegID.at[row_id,"row.ID"]   #feature number
+
+        df = construct_table(df_lcms, lcms_times, lcms_indices, rt, fID, mz, dDa, filename)
+        df.to_csv(fn_output, mode='a', index=False, header=False)
+
+            # example:
+            # 227492,151.07624,33665.4,,,,FoamSample01_Targeted_Neg.csv,0
+            # 227492,152.07179,39222.5,,,,FoamSample01_Targeted_Neg.csv,0
+
+def get_lcms_filenames(path):
+    '''
+        get lcms csv files (corresponding to mzxml files)
+        filenames for liquid chromatography mass spectrometry (LCMS) scans
+    '''
+    # load csv scan files
+    allfiles = listdir(path)
+    mzxmlfiles = sorted([i for i in allfiles if i[-5:].lower() == "mzxml"])
+    # csvfiles = [i.replace("mzXML", "csv") for i in mzxmlfiles]
+    csvfiles = [i[:-5] + "csv" for i in mzxmlfiles] # catches both mzXML and mzxml
+    # "FoamSample01_Targeted_Neg.mzxml"[:-5] == "FoamSample01_Targeted_Neg."
+    # +"csv" = "FoamSample01_Targeted_Neg.csv"
+    csvfiles = [path+i for i in csvfiles if i in allfiles]
+    return csvfiles
+
+def write_header(fn_output):
+    cols = ["Feature", "m/z", "Intensity", "Isotope", "Formula", "PredAbundance", "File", "Zoom"]
+    df = pd.DataFrame({i:[] for i in cols})
+    df.to_csv(fn_output, index=False)
+
+def extract_all_lcms_scans(fn_negID, path, fn_output, mz_id, rt_id, dDa):
+    '''
+        load negID file
+        get lcms csv files (corresponding to mzxml files)
+        write to a target file for each scan
+    '''
+    df_NegID = pd.read_csv(fn_negID)
+    fns_lcms = get_lcms_filenames(path)
+    write_header(fn_output)
+    for fn_lcms in fns_lcms:
+        extract_lcms_at_negID_rts(fn_output, df_NegID, fn_lcms, mz_id, rt_id, dDa)
+
+parser = argparse.ArgumentParser(description='msspectra')
+
+parser.add_argument('--fn_negID',
+                    dest='fn_negID',
+                    type=str,
+                    help='NegIDed file path',
+                    default="NegIDed_FIN.csv")
+parser.add_argument('--path',
+                    dest='path',
+                    type=str,
+                    help='Test files path',
+                    default="./")
+parser.add_argument('--fn_output',
+                    dest='fn_output',
+                    type=str,
+                    help='output file path',
+                    default="scan_extractions_output.csv")
+parser.add_argument('--mz_id',
+                    dest='mz_id',
+                    type=int,
+                    help='NegID column index (Mass to charge ratio)',
+                    default=5)
+parser.add_argument('--rt_id',
+                    dest='rt_id',
+                    type=int,
+                    help='NegID column index (Retention time)',
+                    default=6)
+parser.add_argument('--dDa',
+                    dest='dDa',
+                    type=int,
+                    help='Zoom window in Daltons',
+                    default=5)
+
+def run_program(*args, **kwargs):
+    print(locals())
+    extract_all_lcms_scans(fn_negID = kwargs["fn_negID"], path = kwargs["path"], fn_output = kwargs["fn_output"],
+         mz_id = kwargs["mz_id"], rt_id = kwargs["rt_id"], dDa = kwargs["dDa"])
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    run_program(**vars(args))
\ No newline at end of file

From ce3104394aa9c12286429e66abab48b44f733a7b Mon Sep 17 00:00:00 2001
From: nandarania <nandaraniabril@gmail.com>
Date: Sat, 27 Aug 2022 17:05:01 -0400
Subject: [PATCH 2/2] Added process isotope feature configurable through CLI.

---
 .../LCMS_Spectra/scan_extractions.py          | 77 +++++++++++++++----
 1 file changed, 64 insertions(+), 13 deletions(-)

diff --git a/LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/scan_extractions.py b/LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/scan_extractions.py
index 96913a5b..e9c45e28 100644
--- a/LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/scan_extractions.py
+++ b/LipidMatch_Flow_3.5/Background_Files_LipidMatch_Flow/LipidMatch_Flow/python_scripts/LCMS_Spectra/scan_extractions.py
@@ -3,13 +3,25 @@
 from os import listdir
 import argparse
 
+def get_df_iso():
+    arr_iso_rmass = [0.99939,1.9958,1.00336,2.00671,1.99705,3.9941,5.99115,7.9882,2.00424,1.99795,3.99591,0.99704]
+    arr_iso_symbol = ["33S","34S","13C","13C2","37Cl","37Cl2","37Cl3","37Cl4","18O","81Br","81Br2","15N"]
+    arr_iso_rint = [0.7893,4.4306,1.0816,0.0117,32.3977,10.4961,3.4005,0.8501,0.2005,97.5114,48.7557,0.3613]
+
+    df = pd.DataFrame({
+        'symbol': arr_iso_symbol
+        ,'rmass': arr_iso_rmass
+        ,'intensity': arr_iso_rint
+        })
+    return df
+
 def find_nearest(array, value):
     #https://stackoverflow.com/questions/2566412/find-nearest-value-in-numpy-array
     array = np.asarray(array)
     idx = (np.abs(array - value)).argmin()
     return idx, array[idx]
 
-def get_nearest_rows(df_lcms, times, indices, rt, mz):
+def get_nearest_rows(df_lcms, times, indices, rt):
     # find the nearest time from scans file to negID's rt
     selected_id, selected_rt = find_nearest(times, rt)
     # get the corresponding rows from the scans file for that time
@@ -17,27 +29,55 @@ def get_nearest_rows(df_lcms, times, indices, rt, mz):
     end = indices[selected_id+1] if selected_id + 1 < indices.size else len(df_lcms.index)
     return df_lcms.iloc[start:end, :].copy()
 
-def filter_1(df, mz):
+def remove_under_lowest_percentage(df, dmz, lowest_percentage):
     # filtering could be applied but was generally not necessary
-    selected_mz_ID = df['m/z'].sub(mz).abs().idxmin()
-    Lowest_Intensity = df.at[selected_mz_ID,"Intensity"]*0.01
+    selected_mz_ID = dmz.idxmin()
+    Lowest_Intensity = df.at[selected_mz_ID,"Intensity"]*lowest_percentage
     df.drop(df[(df.Intensity < Lowest_Intensity) & (df.Zoom == 0)].index, inplace = True)
     # alternate approach
     # return df[ (df["Intensity"] > Lowest_Intensity) | (df["Zoom"] == 1) ]
 
-def construct_table(df_lcms, lcms_times, lcms_indices, rt, fID, mz, dDa, filename):
-    df = get_nearest_rows(df_lcms, lcms_times, lcms_indices, rt, mz)
+def iso_string_gen(df_iso_matches, match_i):
+    ''' For a particular index, compose the isotopic string. '''
+    iso = df_iso_matches.iloc[match_i]
+    return f"{iso.symbol}({iso.intensity:0.2f}%;{iso.d_rmass_to_dmzscan_i:0.5f}Da)"
+
+def process_isotopes(df, dmz, isotope_threshold):
+    selected_mz_ID = dmz.idxmin()
+    df.at[selected_mz_ID, "Isotope"] = "M"
+    tmz_scan = df.at[selected_mz_ID,"m/z"]
+    # gets the index of the last row in the zoom window
+    last_zoom_idx = df[df["Zoom"] == 1].index[-1]
+    dmz_scan = df.loc[selected_mz_ID+1:last_zoom_idx, "m/z"] - tmz_scan
+    df_iso = get_df_iso()
+
+    for dmz_i in dmz_scan.index.values:
+        # lists isotopic matches where the relative mass is close enough to the distance from a m/z to the tmz_scan
+        df_iso["d_rmass_to_dmzscan_i"] = np.abs(df_iso.rmass - dmz_scan[dmz_i])
+        df_iso_matches = df_iso[ df_iso.d_rmass_to_dmzscan_i < isotope_threshold ].sort_values(by=["d_rmass_to_dmzscan_i", "intensity"])
+        iso_strings = []
+        for match_i in range(df_iso_matches.shape[0]):
+            iso_strings += [iso_string_gen(df_iso_matches, match_i)]
+        iso_string = ";".join(iso_strings)
+        df.at[dmz_i, "Isotope"] = iso_string
+
+def construct_table(df_lcms, lcms_times, lcms_indices, rt, fID, mz, dDa, filename, lowest_percetange, isotope_threshold):
+    df = get_nearest_rows(df_lcms, lcms_times, lcms_indices, rt)
     df.columns = ["Feature", "m/z", "Intensity"]
     df["Feature"] = fID
     df["Isotope"] = ""
     df["Formula"] = ""
     df["PredAbundance"] = ""
     df["File"] = filename
-    df["Zoom"] = (np.abs(df["m/z"] - mz) < dDa) + 0
-    filter_1(df, mz)
+    dmz = np.abs(df["m/z"] - mz)
+    df["Zoom"] = (dmz < dDa) + 0
+    if lowest_percetange > 0:
+        remove_under_lowest_percentage(df, dmz, lowest_percetange)
+    if isotope_threshold > 0:
+        process_isotopes(df, dmz, isotope_threshold)
     return df.round({"m/z":5, "Intensity":1})
 
-def extract_lcms_at_negID_rts(fn_output, df_NegID, fn_lcms, mz_id, rt_id, dDa):
+def extract_lcms_at_negID_rts(fn_output, df_NegID, fn_lcms, mz_id, rt_id, dDa, lowest_percentage, isotope_threshold):
     '''
         load lcms scan
         get unique times
@@ -54,7 +94,7 @@ def extract_lcms_at_negID_rts(fn_output, df_NegID, fn_lcms, mz_id, rt_id, dDa):
         rt = df_NegID.iat[row_id,rt_id] * 60 #retention time
         fID = df_NegID.at[row_id,"row.ID"]   #feature number
 
-        df = construct_table(df_lcms, lcms_times, lcms_indices, rt, fID, mz, dDa, filename)
+        df = construct_table(df_lcms, lcms_times, lcms_indices, rt, fID, mz, dDa, filename, lowest_percentage, isotope_threshold)
         df.to_csv(fn_output, mode='a', index=False, header=False)
 
             # example:
@@ -81,7 +121,7 @@ def write_header(fn_output):
     df = pd.DataFrame({i:[] for i in cols})
     df.to_csv(fn_output, index=False)
 
-def extract_all_lcms_scans(fn_negID, path, fn_output, mz_id, rt_id, dDa):
+def extract_all_lcms_scans(fn_negID, path, fn_output, mz_id, rt_id, dDa, lowest_percentage, isotope_threshold):
     '''
         load negID file
         get lcms csv files (corresponding to mzxml files)
@@ -91,7 +131,7 @@ def extract_all_lcms_scans(fn_negID, path, fn_output, mz_id, rt_id, dDa):
     fns_lcms = get_lcms_filenames(path)
     write_header(fn_output)
     for fn_lcms in fns_lcms:
-        extract_lcms_at_negID_rts(fn_output, df_NegID, fn_lcms, mz_id, rt_id, dDa)
+        extract_lcms_at_negID_rts(fn_output, df_NegID, fn_lcms, mz_id, rt_id, dDa, lowest_percentage, isotope_threshold)
 
 parser = argparse.ArgumentParser(description='msspectra')
 
@@ -125,11 +165,22 @@ def extract_all_lcms_scans(fn_negID, path, fn_output, mz_id, rt_id, dDa):
                     type=int,
                     help='Zoom window in Daltons',
                     default=5)
+parser.add_argument('--filter',
+                    dest='lowest_percentage',
+                    type=float,
+                    help='Filtering cutoff value, ex: 0.01 is 1 percent of target intensity.',
+                    default=0)
+parser.add_argument('--isotope_threshold',
+                    dest='isotope_threshold',
+                    type=float,
+                    help='Isotope cutoff value for determining a match, ex: 0.005Da.',
+                    default=0.005)
 
 def run_program(*args, **kwargs):
     print(locals())
     extract_all_lcms_scans(fn_negID = kwargs["fn_negID"], path = kwargs["path"], fn_output = kwargs["fn_output"],
-         mz_id = kwargs["mz_id"], rt_id = kwargs["rt_id"], dDa = kwargs["dDa"])
+         mz_id = kwargs["mz_id"], rt_id = kwargs["rt_id"], dDa = kwargs["dDa"]
+         , lowest_percentage = kwargs["lowest_percentage"], isotope_threshold = kwargs["isotope_threshold"])
 
 if __name__ == '__main__':
     args = parser.parse_args()