-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathadd_negatives.py
More file actions
96 lines (76 loc) · 3.77 KB
/
add_negatives.py
File metadata and controls
96 lines (76 loc) · 3.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import pandas as pd
import numpy as np
def add_negative_samples_from_masterlist(
    df: pd.DataFrame, file_name: str, masterlist_path: str
) -> pd.DataFrame:
    """Append master-list compounds missing from ``df`` as negative samples.

    The master list (library) name is taken from the first non-null value of
    the ``LIBRARY_NAME`` column of ``df`` (one file = one library) and loaded
    from ``<library>.xlsx`` or ``<library>.csv`` inside ``masterlist_path``
    (``.xlsx`` wins if both exist). No separate mapping file is needed.

    Every master-list row whose ``SMILES`` is not already in ``df`` is appended
    with ``BINARY_LABEL = 0``, ``ENRICHMENT``/``PVALUE`` set to NaN,
    ``MassSpec_Detected = "N"`` and the ``TARGET_ID`` of the first row of
    ``df``. Rows coming from ``df`` are marked ``MassSpec_Detected = "Y"``.
    Selected master-list columns are copied under the names used in ``df``
    (see ``column_mapping`` in the body); a mapping whose source column is
    absent from the master list is skipped.

    Args:
        df: The input DataFrame. Must contain ``SMILES`` and ``TARGET_ID``
            columns once a master list has been found. It is not modified.
        file_name: The name of the processed file (used only for log messages).
        masterlist_path: The directory containing master list files.

    Returns:
        A new DataFrame with the negative samples appended (index reset), or
        ``df`` itself, unchanged, when ``LIBRARY_NAME`` is missing or empty,
        no master list file is found, or the master list adds nothing new.

    Raises:
        ValueError: If the master list has no ``SMILES`` column.
        KeyError: If ``df`` has no ``SMILES`` or ``TARGET_ID`` column.
    """
    # Resolve the library name straight from the LIBRARY_NAME column
    # (one file = one library).
    if "LIBRARY_NAME" not in df.columns:
        print(f"Warning: No LIBRARY_NAME column in {file_name}. Skipping negative sample addition.")
        return df
    library_values = df["LIBRARY_NAME"].dropna()
    if library_values.empty:
        print(f"Warning: LIBRARY_NAME column is empty in {file_name}. Skipping negative sample addition.")
        return df
    masterlist_name = str(library_values.iloc[0]).strip()

    # Resolve the master list file, accepting either .xlsx or .csv (xlsx wins
    # if both exist). isfile() rather than exists() so that a directory that
    # happens to be called "<library>.xlsx" is not handed to the reader.
    masterlist_file = None
    for ext in (".xlsx", ".csv"):
        candidate = os.path.join(masterlist_path, f"{masterlist_name}{ext}")
        if os.path.isfile(candidate):
            masterlist_file = candidate
            break
    if masterlist_file is None:
        print(f"Warning: Master list file '{masterlist_name}' (.xlsx or .csv) not found in {masterlist_path}. Skipping negative sample addition.")
        return df

    # Load the master list file with the reader matching its extension.
    if masterlist_file.lower().endswith(".csv"):
        master_df = pd.read_csv(masterlist_file)
    else:
        master_df = pd.read_excel(masterlist_file)

    # Ensure 'SMILES' column exists in the master list.
    if "SMILES" not in master_df.columns:
        raise ValueError(f"Master list file {masterlist_name} must contain a 'SMILES' column")

    # Identify master-list rows whose SMILES is NOT present in df.
    existing_smiles = set(df["SMILES"].dropna())
    new_entries = master_df[~master_df["SMILES"].isin(existing_smiles)].copy()
    if new_entries.empty:
        print(f"No new negative samples found for {file_name}.")
        return df

    # Assign BINARY_LABEL = 0 for these new negative samples (integer 0/1 convention).
    new_entries["BINARY_LABEL"] = 0
    new_entries["ENRICHMENT"] = np.nan
    new_entries["PVALUE"] = np.nan
    new_entries["MassSpec_Detected"] = "N"
    # Positional access: the index of df is not guaranteed to contain label 0
    # (e.g. after filtering), so df["TARGET_ID"][0] could raise KeyError.
    new_entries["TARGET_ID"] = df["TARGET_ID"].iloc[0]

    # Work on a copy so the caller's DataFrame is not modified as a side effect.
    df = df.copy()
    df["MassSpec_Detected"] = "Y"

    # Define column mapping (Master List -> df).
    column_mapping = {
        "SGC ID for Component": "COMPOUND_ID",
        "SGC ID for Pool": "POOL_NAME",
        "formula": "COMPOUND_FORMULA",
        # Add more mappings if needed
    }

    # Apply column mappings; sources missing from the master list are skipped.
    for master_col, df_col in column_mapping.items():
        if master_col in new_entries.columns:
            new_entries[df_col] = new_entries[master_col]

    # Select only relevant columns to append. Mapped targets are included only
    # when they actually exist, otherwise the selection would raise KeyError;
    # pd.concat fills columns absent on one side with NaN.
    mapped_columns = [col for col in column_mapping.values() if col in new_entries.columns]
    columns_to_add = [
        "SMILES",
        "BINARY_LABEL",
        "ENRICHMENT",
        "PVALUE",
        "MassSpec_Detected",
        "TARGET_ID",
    ] + mapped_columns
    df = pd.concat([df, new_entries[columns_to_add]], ignore_index=True)
    print(f"Added {len(new_entries)} negative samples to {file_name} from {masterlist_name}, including columns: {mapped_columns}")
    return df