diff --git a/.gitignore b/.gitignore index c557836..2d47b79 100644 --- a/.gitignore +++ b/.gitignore @@ -13,10 +13,11 @@ Run_scripts/* run_experiments_*.sh test*.txt eval*.sh +fairnessbench_analysis/*/*.png +fairnessbench_analysis/*/*.csv - - - +# path +path.py # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/fairnessbench_analysis/acc_di_tradeoff_heatmap.py b/fairnessbench_analysis/acc_di_tradeoff_heatmap.py new file mode 100644 index 0000000..e116652 --- /dev/null +++ b/fairnessbench_analysis/acc_di_tradeoff_heatmap.py @@ -0,0 +1,129 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + + +file= CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' +perf_alt = pd.read_csv(file) +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff','final_flake8_score'] +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','final_flake8_score'] +wider_code = wider_code[wider_cols] + +task_task_dem = wider_code['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'rsch_prob',1:'dem'}) +wider = pd.concat([wider_code, task_task_dem],axis=1) +cols= 
['model','task','dataset','task_metric','task-dem','rsch_prob','dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','final_flake8_score'] +wider=wider[cols] +wider=wider.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +# Filtering only balance task from the dataframe +wider_balance = wider[wider['rsch_prob'].isin(["balance", "implicit", "best"])].copy() + +# Adding fairness column +df = wider_balance.copy() +df["fair"] = 1 - (df['di'] - 1).abs() +df["fair"] = df["fair"].clip(lower=0) + +def pareto_max(df, x, y): + """ + Return Pareto optimal points (maximizing both x and y). + """ + data = df[[x, y]].to_numpy() + keep = np.ones(len(df), dtype=bool) + + for i in range(len(df)): + # point j dominates i if: + # j is >= in both AND > in at least one + dominates = np.all(data >= data[i], axis=1) & np.any(data > data[i], axis=1) + dominates[i] = False + if np.any(dominates): + keep[i] = False + + return df[keep] + +pareto_all = [] + +# loop through each panel and research problem +for (dataset, model, prob), g in df.groupby(['dataset','model', 'rsch_prob']): + front = pareto_max(g, 'acc', "fair") + + # radius and angle for Pareto points + front = front.copy() + front["r"] = np.sqrt(front['acc']**2 + front["fair"]**2) + front["theta"] = np.arctan2(front["fair"], front['acc']) # radians + + pareto_all.append(front) + +pareto_df = pd.concat(pareto_all, ignore_index=True) + +r_summary = pareto_df.groupby(['dataset','model', 'rsch_prob'])["r"].mean().reset_index() +r_summary = r_summary.rename(columns={"r": "r_mean"}) + +# Function to calculate circular mean +def circ_mean(theta): + return np.arctan2(np.mean(np.sin(theta)), np.mean(np.cos(theta))) + +theta_summary = ( + pareto_df.groupby(['dataset','model', 'rsch_prob'])["theta"] + .apply(circ_mean) + .reset_index() + .rename(columns={"theta": "theta_mean"}) +) + +summary = r_summary.merge(theta_summary, 
on=['dataset','model', 'rsch_prob']) +summary["theta_mean_deg"] = np.degrees(summary["theta_mean"]) + +angle_ranges = summary.groupby("rsch_prob")["theta_mean_deg"].agg(["min","max"]) + +# Function to calculate the overlap between two ranges +def range_overlap(a_min, a_max, b_min, b_max): + overlap = max(0, min(a_max, b_max) - max(a_min, b_min)) + total = max(a_max, b_max) - min(a_min, b_min) + return overlap / total if total > 0 else 0 + +pairs = [("balance","implicit"), + ("balance","best"), + ("implicit","best")] + + +# centering +summary["theta_centered"] = summary["theta_mean_deg"] - 45 + +sns.set_context(context='poster',font_scale= 0.75) +cmap = sns.diverging_palette(145, 300, as_cmap=True) +summary["pm"] = summary["rsch_prob"].astype(str) + "-" + summary["model"].astype(str) +pivot = summary.pivot( + index="pm", + columns="dataset", + values="theta_centered" +) + + +plt.figure(figsize=(12,8)) + +sns.heatmap( + pivot, + cmap=cmap, + center=0, + vmin=-45, + vmax=45, + annot=True, + fmt=".1f" + ) + + +plt.xlabel("Dataset") +plt.ylabel("Research Problem | Model") +plt.tight_layout() +output= os.path.join(GRAPHS,'acc_di_tradeoff_heatmap.png') +plt.savefig(output, dpi=300, bbox_inches='tight') diff --git a/fairnessbench_analysis/acc_fairness_overlap_heatmap.py b/fairnessbench_analysis/acc_fairness_overlap_heatmap.py new file mode 100644 index 0000000..f8dcd16 --- /dev/null +++ b/fairnessbench_analysis/acc_fairness_overlap_heatmap.py @@ -0,0 +1,155 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + + +file= CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' +perf_alt = pd.read_csv(file) + +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff','final_flake8_score'] + +perf_alt = perf_alt.replace([np.inf, -np.inf], 
np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','final_flake8_score'] +wider_code = wider_code[wider_cols] + +task_task_dem = wider_code['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'rsch_prob',1:'dem'}) +wider = pd.concat([wider_code, task_task_dem],axis=1) +cols= ['model','task','task_dataset','task_metric','task-dem','rsch_prob','dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','final_flake8_score'] +wider=wider[cols] +wider=wider.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +# summary df of the mean and std +summary = ( + wider.groupby(["model", "task_dataset", "rsch_prob"]) + .agg( + mean_acc=("acc","mean"), + mean_di=("di","mean"), + std_acc=("acc","std"), + std_di=("di","std") + ) + .reset_index() +) + +allowed_datasets = ["randoadult","sampadult","nondescriptive",'health'] + +results = [] + +# here we loop over each (model, research problem) group +for (model, rp), group in summary.groupby(["model","rsch_prob"]): + + # find baseline (adult) + base_row = group[group["task_dataset"]=="adult"] +#if a model+research-problem doesn’t have an Adult row, I can’t compute diffs, so skip it + if base_row.empty: + continue +# extracting adults mean n std , .iloc[0] grabs the first row value (assumes only one Adult row exists in this group). 
+ base_di = base_row["mean_di"].iloc[0] + base_acc = base_row["mean_acc"].iloc[0] + # build baseline DI interval using variance + base_min_di = base_di - base_row["std_di"].iloc[0] + base_max_di = base_di + base_row['std_di'].iloc[0] + # build baseline ACC interval using std + base_min_acc = base_acc - base_row["std_acc"].iloc[0] + base_max_acc = base_acc + base_row['std_acc'].iloc[0] +# iterate through each dataset std (adult, randoadult, sampadult, …) for this model+rproblem + for _, row in group.iterrows(): + # get dataset name for this row + dataset = row["task_dataset"] + # skip baseline itself, don't compare adult to itself + if dataset == "adult": + continue + if dataset not in allowed_datasets: + continue + + change_type = "data_change" + if row["task_dataset"] == "nondescriptive": + change_type = "context_change" + row_max_di = (row["mean_di"] + row['std_di']) + row_min_di = (row["mean_di"] - row['std_di']) + + row_max_acc = (row["mean_acc"] + row['std_acc']) + row_min_acc = (row["mean_acc"] - row['std_acc']) + results.append({ + "model": model, + "rsch_prob": rp, + "baseline_dataset": "adult", + "comparison_dataset": dataset, + # Differences in mean fairness and accuracy, these measure how far the means move from Adult baseline. 
+ "fairness_diff": abs(base_di - row["mean_di"]), + "accuracy_diff": abs(base_acc - row["mean_acc"]), + 'min_fair_dff':row_min_di, + 'max_fair_diff':row_max_di, + 'min_acc_diff':row_min_acc, + 'max_acc_diff':row_max_acc, + # compute overlap between baseline interval and comparison interval + # overlap = 0 intervals are disjoint (stronger evidence of change) + # overlap large intervals similar/overlapping (weaker evidence of change) + 'overlap_fair': max(0,min(base_max_di,row_max_di)-max(base_min_di,row_min_di)), # width + # Baseline interval length + 'len_man_min': base_max_di - base_min_di, + 'overlap_acc': max(0,min(base_max_acc,row_max_acc)-max(base_min_acc,row_min_acc)), # width + 'len_acc_minmax':base_max_acc - base_min_acc, + "change_type": change_type + }) + + +sens_df = pd.DataFrame(results) +sens_df['final_overlap_fair'] = sens_df['overlap_fair'] / sens_df['len_man_min'] +sens_df['final_overlap_acc'] = sens_df['overlap_acc'] / sens_df['len_acc_minmax'] + +# heatmap for fairness_overlap +heatmap_data = sens_df.pivot_table( + index=["rsch_prob", "model"], + columns="comparison_dataset", + values="final_overlap_fair", + aggfunc="mean" +).sort_index(level=["rsch_prob", "model"]) + +plt.figure(figsize=(12,6)) + +sns.heatmap( + heatmap_data, + annot=True, + cmap="viridis", + linewidths=0.5 +) + +plt.title("Fairness Sensitivity Across Dataset Variants") +plt.xlabel("Comparison Dataset") +plt.ylabel("Research Problem | Model") + +output= os.path.join(GRAPHS,'fairness_overlap_heatmap.png') +plt.savefig(output, dpi=300, bbox_inches='tight') + +# heatmap for acc_overlap +heatmap_data = sens_df.pivot_table( + index=["rsch_prob", "model"], + columns="comparison_dataset", + values="final_overlap_acc", + aggfunc="mean" +).sort_index(level=["rsch_prob", "model"]) + +plt.figure(figsize=(12,6)) + +sns.heatmap( + heatmap_data, + annot=True, + cmap="viridis", + linewidths=0.5 +) + +plt.title("Accuracy Sensitivity Across Dataset Variants") +plt.xlabel("Comparison Dataset") 
+plt.ylabel("Research Problem | Model") +output= os.path.join(GRAPHS,'acc_overlap_heatmap.png') +plt.savefig(output, dpi=300, bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/adult_di_code_llmeval.py b/fairnessbench_analysis/adult_di_code_llmeval.py new file mode 100644 index 0000000..69714c3 --- /dev/null +++ b/fairnessbench_analysis/adult_di_code_llmeval.py @@ -0,0 +1,59 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS +# Loading useful dataframes +file = CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv' +code_eval = pd.read_csv(file) + + +# Removing missing rows fairnessBench +code_eval= code_eval.dropna(how="any") + +task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"] +wider_code = wider_code[wider_cols] + +# filtering the adult dataset and the di task_metric +adult= wider_code[wider_code['task_dataset']=='adult'] +adult_di=adult[adult['task_metric']=='di'] + +long_df = adult_di.melt( + id_vars=['model','task_dataset','task_metric','resrch_prob','dem'], + value_vars=[ + '1. Data Collection and Processing', + '2. Bias Detection and Mitigation', + '3. Fairness Metric Selection', + '4. Model Selection and Training', + '5. 
Evaluation and Testing' + ], + var_name='rubric_section', + value_name='score' +) + +sns.set_context(context='poster',font_scale=1.0) +plt.figsize=(16,12) +m=sns.catplot( + data=long_df, + x="rubric_section", + y="score", + hue="model", + col="resrch_prob", + row='dem', + kind="bar", + aspect=2 +) +m.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +ax=m.axes +ax = m.axes +for ax in m.axes.flatten(): + plt.setp(ax.get_xticklabels(), rotation=30) + ax.axhline(y=4.0, color='black', linestyle='-.', alpha=0.3) + +output = os.path.join(GRAPHS, 'adult_di_code_llm_eval.png') +plt.savefig(output, dpi=400 , bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/adult_fairness.py b/fairnessbench_analysis/adult_fairness.py new file mode 100644 index 0000000..7af2c7e --- /dev/null +++ b/fairnessbench_analysis/adult_fairness.py @@ -0,0 +1,64 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + +# Loading useful dataframes +file = CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' +perf_alt = pd.read_csv(file) + + +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_alt= perf_alt.dropna(subset=perf, how='all') +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff'] +wider_code = 
wider_code[wider_cols] + +# Filtering only adult dataset from the dataframe +wider_adult = wider_code[wider_code['task_dataset']=='adult'] +fairness_metrics= ['di','error_rate_ratio','statistical_parity_diff','equal_opp_diff','error_rate_diff','false_omission_rate_diff'] +wider_ADULT = ( + wider_adult.groupby(['model','task-dem','task_metric'])[fairness_metrics].mean() +).reset_index() +ad_df= wider_ADULT['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_ADULT=pd.concat([wider_ADULT,ad_df],axis=1) +wider_ADULT=wider_ADULT.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 'erd' : 'error_rate_diff', + 'ford': 'false_omission_rate_diff', +} +wider_ADULT.loc[:, 'task_metric_value'] = wider_ADULT.apply(lambda row: row[metric_map[row['task_metric']]], axis=1) + +sns.set_context(context='poster',font_scale=1.0) +g=sns.catplot(data=wider_ADULT,x='resrch_prob',y='task_metric_value',hue='dem',row='task_metric',col='model',kind='bar' + ,aspect=1) + +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +# adding horizontal lines at di=1 to each cell plot +ax=g.axes +for i in range(4): + ax[0,i].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) + ax[1,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) + ax[2,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) + ax[3,i].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) + ax[4,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) + ax[5,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) + +output = os.path.join(GRAPHS, 'adult_fairness.png') +plt.savefig(output,dpi=300, bbox_inches='tight') diff --git a/fairnessbench_analysis/balancing_fairness.py b/fairnessbench_analysis/balancing_fairness.py new file mode 100644 index 0000000..c9dc503 --- /dev/null +++ 
b/fairnessbench_analysis/balancing_fairness.py @@ -0,0 +1,133 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + +# Loading useful dataframes +file = CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' +perf_alt = pd.read_csv(file) + +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_alt= perf_alt.dropna(subset=perf, how='all') +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff'] +wider_code = wider_code[wider_cols] + +task_task_dem = wider_code['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'rsch_prob',1:'dem'}) +wider = pd.concat([wider_code, task_task_dem],axis=1) +cols= ['model','task','task_dataset','task_metric','task-dem','rsch_prob','dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff'] +wider=wider[cols] + +# Filtering only balance task from the dataframe +wider_balance = wider[wider['rsch_prob']=='balance'] +wider_balance=wider_balance.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +wider_balance= wider_balance.copy() +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 'erd' : 'error_rate_diff', 
+ 'ford': 'false_omission_rate_diff', +} +wider_balance.loc[:, 'task_metric_value'] = wider_balance.apply(lambda row: row[metric_map[row['task_metric']]], axis=1) + +sns.set_context(context='poster',font_scale=1.0) +g=sns.relplot(data=wider_balance,x='acc',y='task_metric_value',hue='task_dataset',style='dem',row='task_metric',col='model',kind='scatter', + aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') + +# add horizontal lines at di=1 for the first row and a vertical line for acc=1 +ax=g.axes +for i in range(len(ax)): + for j in range(len(ax[0])): + if i in [0,3]: # first row (di) + ax[i,j].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) + elif i in [1, 2, 4, 5]: # other fairness metrics + ax[i,j].axhline(y=0.0, color='black', linestyle='-.', alpha=0.3) + ax[i,j].axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) + +output = os.path.join(GRAPHS,'balancing_fairness.png') +plt.savefig(output,dpi=400,bbox_inches='tight') + + +# Filtering only best task from the dataframe +wider_best = wider[wider['rsch_prob']=='best'] +wider_best=wider_best.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +wider_best= wider_best.copy() +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 'erd' : 'error_rate_diff', + 'ford': 'false_omission_rate_diff', +} +wider_best.loc[:, 'task_metric_value'] = wider_best.apply(lambda row: row[metric_map[row['task_metric']]], axis=1) + +sns.set_context(context='poster',font_scale=1.0) +g=sns.relplot(data=wider_best,x='acc',y='task_metric_value',hue='task_dataset',style='dem',row='task_metric',col='model',kind='scatter', + aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') + +# add horizontal lines at di=1 for the first row and a vertical line for acc=1 +ax=g.axes +for i in range(len(ax)): + for j in range(len(ax[0])): + if i in [0,3]: # first row (di) + ax[i,j].axhline(y=1.0, 
color='black', linestyle='-.', alpha=0.3) + elif i in [1, 2, 4, 5]: # other fairness metrics + ax[i,j].axhline(y=0.0, color='black', linestyle='-.', alpha=0.3) + ax[i,j].axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) + +output = os.path.join(GRAPHS,'balancing_fairness_best.png') +plt.savefig(output,dpi=400,bbox_inches='tight') + + +# Filtering only implicit task from the dataframe +wider_implicit = wider[wider['rsch_prob']=='implicit'] +wider_implicit=wider_implicit.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +wider_implicit= wider_implicit.copy() +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 'erd' : 'error_rate_diff', + 'ford': 'false_omission_rate_diff', +} +wider_implicit.loc[:, 'task_metric_value'] = wider_implicit.apply(lambda row: row[metric_map[row['task_metric']]], axis=1) + +sns.set_context(context='poster',font_scale=1.0) +g=sns.relplot(data=wider_implicit,x='acc',y='task_metric_value',hue='task_dataset',style='dem',row='task_metric',col='model',kind='scatter', + aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') + +# add horizontal lines at di=1 for the first row and a vertical line for acc=1 +ax=g.axes +for i in range(len(ax)): + for j in range(len(ax[0])): + if i in [0,3]: # first row (di) + ax[i,j].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) + elif i in [1, 2, 4, 5]: # other fairness metrics + ax[i,j].axhline(y=0.0, color='black', linestyle='-.', alpha=0.3) + ax[i,j].axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) + +output = os.path.join(GRAPHS,'balancing_fairness_implicit.png') +plt.savefig(output,dpi=400,bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/code_log_llm_eval.py b/fairnessbench_analysis/code_log_llm_eval.py new file mode 100644 index 0000000..906635f --- /dev/null +++ b/fairnessbench_analysis/code_log_llm_eval.py @@ -0,0 +1,54 @@ 
+import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + + +# Loading useful dataframes +code_eval = pd.read_csv(CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv') +log_eval = pd.read_csv(CSV_FILES/'Results_Final_log_clean2025-09-18T00:48:52.486398.csv') +perf_df= pd.read_csv(CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv') + + +# Removing missing rows +code_eval= code_eval.dropna(how="any") +code_eval = code_eval.fillna(0) +log_eval= log_eval.dropna(how='any') +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_df= perf_df.dropna(subset=perf, how='all') +perf_df = perf_df.fillna(0) + +task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','final_flake8_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"] +wider_code = wider_code[wider_cols] + +score_cols = ["1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. 
Evaluation and Testing"] +code_tall = wider_code.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'], + value_vars=score_cols,var_name='score') +output= os.path.join(GRAPHS,'codeval') +sns.catplot(code_tall,col='model',row='resrch_prob',x= 'task_dataset',y='value',hue='score',kind='bar').savefig(output) + + +# log eval +task_data_metric = log_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_log = pd.concat([log_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id',"1. Model Overview", "2. Stakeholder Identification and Fairness Definition","3. Data Collection and Processing","4. Bias Detection and Mitigation","5. Fairness Metric Selection","6. Model Selection and Training","7. Evaluation and Testing"] +wider_log = wider_log[wider_cols] +wider_log.head() + +score_cols = ["1. Model Overview", "2. Stakeholder Identification and Fairness Definition","3. Data Collection and Processing","4. Bias Detection and Mitigation","5. Fairness Metric Selection","6. Model Selection and Training","7. 
Evaluation and Testing"] +log_tall = wider_log.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'], + value_vars=score_cols,var_name='score') + + +output=os.path.join(GRAPHS,'logval') +sns.catplot(log_tall,col='model',x='resrch_prob',row= 'task_dataset',y='value',hue='score',kind='bar').savefig(output) + diff --git a/fairnessbench_analysis/comparing_flake8_bal_be_impli.py b/fairnessbench_analysis/comparing_flake8_bal_be_impli.py new file mode 100644 index 0000000..01310a1 --- /dev/null +++ b/fairnessbench_analysis/comparing_flake8_bal_be_impli.py @@ -0,0 +1,48 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + + +file= CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' +perf_alt = pd.read_csv(file) +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff','final_flake8_score'] +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','final_flake8_score'] +wider_code = wider_code[wider_cols] + +task_task_dem = wider_code['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'rsch_prob',1:'dem'}) +wider = pd.concat([wider_code, task_task_dem],axis=1) +cols= 
['model','task','dataset','task_metric','task-dem','rsch_prob','dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','final_flake8_score'] +wider=wider[cols] +wider=wider.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +# Filtering only balance task from the dataframe +wider_balance = wider[wider['rsch_prob'].isin(["balance", "implicit", "best"])].copy() +sns.set_context(context='poster',font_scale= 0.75) +g=sns.catplot( + data=wider_balance, + x="rsch_prob", + y="final_flake8_score", + col="model", + row="dataset", + kind="bar", + height=4, + aspect=1 +) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') + +for ax in g.axes.flat: + ax.axhline(y=85, color='black', linestyle='--') + +output= os.path.join(GRAPHS,'comparing_flake8_bal_be_impli.png') +plt.savefig(output, dpi=300, bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/correlation_flake8_code.py b/fairnessbench_analysis/correlation_flake8_code.py new file mode 100644 index 0000000..2b4c237 --- /dev/null +++ b/fairnessbench_analysis/correlation_flake8_code.py @@ -0,0 +1,57 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + +# Loading useful dataframes +file= CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv' +code_eval = pd.read_csv(file) + +# Removing missing rows +code_eval= code_eval.dropna(how="any") + +task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) 
+wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing",'final_flake8_score'] +wider_code = wider_code[wider_cols] + +# Correlation between flake8 and code llm eval on claude_adult_di_erd task +code_cols=['1. Data Collection and Processing','2. Bias Detection and Mitigation','3. Fairness Metric Selection','4. Model Selection and Training', '5. Evaluation and Testing'] +group_cols = ["model", "task_dataset", "resrch_prob", "task_metric"] # Add 'task_dem' if needed + +def flake8_corr_matrix(group): + # Compute correlation between flake8_score and each rubric section + corrs = [group["final_flake8_score"].corr(group[rubric]) for rubric in code_cols] + return pd.Series(corrs, index=code_cols) + +corrs = ( + wider_code.groupby(group_cols) + .apply(flake8_corr_matrix) + .reset_index() +) +corrs=corrs.fillna(0) + +group_filter = ( + (corrs['model'] == 'claude-3-7-sonnet-20250219') & + (corrs['task_dataset'] == 'adult') & + (corrs['resrch_prob'] == 'balance') & + (corrs['task_metric'] == 'erd') +) +corr_row = corrs.loc[group_filter, code_cols] + +plt.figure(figsize=(8, 2)) +sns.heatmap( + corr_row.values.reshape(1, -1), + annot=True, + cmap='coolwarm', + xticklabels=code_cols, + yticklabels=['Flake8 score'] +) +plt.title("Flake8 vs Rubric Correlation (claude-3, adult, balance, erd)") + +output= os.path.join(GRAPHS,'flake8_vs_code_correlation.png') +plt.savefig(output,bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/cv_scores_evalmodels.py b/fairnessbench_analysis/cv_scores_evalmodels.py new file mode 100644 index 0000000..e7d9cd7 --- /dev/null +++ b/fairnessbench_analysis/cv_scores_evalmodels.py @@ -0,0 +1,20 @@ +import os +import pandas as pd +from path import FILES + +gemma_df = pd.read_csv(FILES/'Gemma_cv.csv') 
+deepseek_df= pd.read_csv(FILES/'Deepseek_cv.csv') +granite_df=pd.read_csv(FILES/'Granite_cv.csv') + +gemma_df['eval'] = 'gemma' +deepseek_df['eval'] = 'deepseek' +granite_df['eval'] = 'granite' + +cols = ['eval', 'model', 'task'] + [c for c in gemma_df.columns if c not in ['eval', 'model', 'task']] +gemma_df = gemma_df[cols] +deepseek_df = deepseek_df[cols] +granite_df = granite_df[cols] + +all_eval_cv = pd.concat([gemma_df, deepseek_df, granite_df], axis=0, ignore_index=True) + +all_eval_cv.to_csv('cv_scores_evalmodel.csv',index=False) \ No newline at end of file diff --git a/fairnessbench_analysis/di_across_datasets.py b/fairnessbench_analysis/di_across_datasets.py new file mode 100644 index 0000000..c15b728 --- /dev/null +++ b/fairnessbench_analysis/di_across_datasets.py @@ -0,0 +1,88 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + +# Loading useful dataframes +file= CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' +perf_alt = pd.read_csv(file) + +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +#perf_alt= perf_alt.dropna(subset=perf, how='all') +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff'] +wider_code = wider_code[wider_cols] + +# Filtering only DI from the dataframe +wider_di = 
wider_code[wider_code['task_metric']=='di'] +wider_DI = ( + wider_di.groupby(['task_dataset','task-dem'])[['di','acc']].mean().reset_index() +) +dem_df= wider_di['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_DI=pd.concat([wider_di,dem_df],axis=1) +wider_DI=wider_DI.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +# ploting the scatter plot for di vs acc +sns.set_context(context='poster',font_scale=0.8) +g=sns.relplot(data=wider_DI,x='acc',y='di',hue='task_dataset',row='resrch_prob',style='dem',col='model',kind='scatter',aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') + +# adding horizontal lines at di=1 to each cell plot +for i, ax in enumerate(g.axes.flat): + ax.axhline(y=1.0, color='black', linestyle='-.', alpha=0.2) + ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) +# saving the plot +output= os.path.join(GRAPHS,'di_vs_acc_scatter.png') +plt.savefig(output, dpi=300, bbox_inches='tight') + + +# checking SPD vs Acc +wider_spd = wider_code[wider_code['task_metric']=='spd'] +wider_SPD = ( + wider_spd.groupby(['task_dataset','task-dem'])[['statistical_parity_diff','acc']].mean().reset_index() +) +dem_df= wider_spd['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_SPD=pd.concat([wider_spd,dem_df],axis=1) +wider_SPD=wider_SPD.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') +# ploting the scatter plot for spd vs acc +sns.set_context(context='poster',font_scale=0.8) +g=sns.relplot(data=wider_SPD,x='acc',y='statistical_parity_diff',hue='task_dataset',row='resrch_prob',style='dem',col='model',kind='scatter',aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +# adding horizontal lines at di=1 to each cell plot +for i, ax in enumerate(g.axes.flat): + ax.axhline(y=0.0, color='black', linestyle='-.', alpha=0.2) + ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) +# saving the plot 
+output= os.path.join(GRAPHS,'spd_vs_acc_scatter.png') +plt.savefig(output, dpi=300, bbox_inches='tight') + + +# checking EOD vs Acc +wider_eod = wider_code[wider_code['task_metric']=='eod'] +wider_EOD = ( + wider_eod.groupby(['task_dataset','task-dem'])[['equal_opp_diff','acc']].mean().reset_index() +) +dem_df= wider_eod['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_EOD=pd.concat([wider_eod,dem_df],axis=1) +wider_EOD=wider_EOD.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') +# ploting the scatter plot for eod vs acc +sns.set_context(context='poster',font_scale=0.8) +g=sns.relplot(data=wider_EOD,x='acc',y='equal_opp_diff',hue='task_dataset',row='resrch_prob',style='dem',col='model',kind='scatter',aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +# adding horizontal lines at di=1 to each cell plot +for i, ax in enumerate(g.axes.flat): + ax.axhline(y=0.0, color='black', linestyle='-.', alpha=0.2) + ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) +# saving the plot +output= os.path.join(GRAPHS,'eod_vs_acc_scatter.png') +plt.savefig(output, dpi=300, bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/dollarstreet_analysis.py b/fairnessbench_analysis/dollarstreet_analysis.py new file mode 100644 index 0000000..a31d9b7 --- /dev/null +++ b/fairnessbench_analysis/dollarstreet_analysis.py @@ -0,0 +1,61 @@ + +import pandas as pd + + +df_final = pd.read_csv("Final_step_perfomance2026-02-07T02:23:17.201613.csv") +df_final.head() + + +df_baseline = pd.read_csv("Baseline_cleaned_perfomance2026-02-07T02:23:19.806782.csv") +df_baseline.head(2) + +df_dollar_baseline = df_baseline[df_baseline['baseline_Advantaged'].notna() & df_baseline['baseline_Disadvantaged'].notna()] + +cols_to_keep = ['task', 'run_ts', 'run_id','baseline_Advantaged', 'baseline_Disadvantaged'] + +df_dollar_baseline = df_dollar_baseline[cols_to_keep] +df_dollar_baseline + + 
+df_dollar_baseline['model'] = 'baseline' +df_dollar_baseline + + +df_final.columns + + +cols_to_keep = ['model', 'task', 'run_ts', 'run_id','Advantaged', 'Disadvantaged'] + + +df_dollar_res = df_final[df_final['Advantaged'].notna() & df_final['Disadvantaged'].notna()] + + +df_dollar_res = df_dollar_res[cols_to_keep] +df_dollar_res.head(2) + + +df_dollar_res['model'].value_counts() + + +df_dollar_baseline = df_dollar_baseline.rename(columns={"baseline_Advantaged": "Advantaged", "baseline_Disadvantaged": "Disadvantaged"}) + + +df_combined = pd.concat([df_dollar_baseline, df_dollar_res], ignore_index=True) +df_combined.head(2) + +df_combined['model'] = df_combined['model'].replace('claude-3-7-sonnet-20250219', 'claude-3-7-sonnet') +df_combined + + +df_avg = (df_combined.groupby('model').agg(avg_adv_acc=("Advantaged", "mean"), +avg_disadv_acc=("Disadvantaged", "mean"), std_adv_acc=("Advantaged", "std"), +std_disadv_acc=("Disadvantaged", "std"), n_runs=("Advantaged", "count")).reset_index()) +df_avg + + +df_avg['disparity']= df_avg['avg_adv_acc']- df_avg['avg_disadv_acc'] +df_avg + + + + diff --git a/fairnessbench_analysis/explode_results.py b/fairnessbench_analysis/explode_results.py new file mode 100644 index 0000000..35fdc81 --- /dev/null +++ b/fairnessbench_analysis/explode_results.py @@ -0,0 +1,111 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import json +from path import PROJECT_ROOT,CSV_FILES + +# loading the performance results +perf_path = PROJECT_ROOT +result_files = [ + os.path.join(perf_path, fname) + for fname in os.listdir(perf_path) + if os.path.isfile(os.path.join(perf_path, fname)) +] +result_list = [] +for rf in result_files: + try: + if os.path.getsize(rf) == 0: + print(f"Skipping empty file: {rf}") + continue + df = pd.read_json(rf).T + result_list.append(df) + except Exception as e: + print(f"Skipping file {rf} due to error: {e}") +performance_df = pd.concat(result_list) +# Drop unsuccessful runs - keep 
only dict type (successful runs) +result_df_successful = performance_df[performance_df['final_score'].apply(lambda x: isinstance(x, dict))] +print(f"Total rows after filtering: {len(result_df_successful)}") +print(f"Rows dropped: {len(performance_df) - len(result_df_successful)}") + +end_series = lambda s: pd.Series(s[-5:]) +model_run = result_df_successful['path'].str.split('/').apply(end_series).rename(columns = + {i:c for i,c in enumerate(['model','task','run_ts'])}) + +model_run['run_id']= model_run.groupby(['model','task']).cumcount() +mr_keep = ['model','task','run_ts','run_id'] + +# extracting the final and the steps performance scores for the results to save in a csv file +exploded_score = result_df_successful['final_score'].apply(pd.Series).reset_index() +sp = exploded_score['index'].str.split('/').apply(end_series) +sp = sp.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) + +exploded_score = exploded_score.join(sp[['model', 'task', 'run_ts']]) + +exploded_score['run_id'] = exploded_score.groupby(['model', 'task']).cumcount() + +cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exploded_score.columns if col not in ['model', 'task', 'run_ts', 'run_id']] +exploded_score = exploded_score[cols] +exploded_score = exploded_score.drop(exploded_score.columns[4],axis=1) + +# adding flake8 results to performnce df +flake8_df = result_df_successful[['path', 'final_flake8_score']].copy() +sps = flake8_df['path'].str.split('/').apply(end_series) +sps = sps.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) +flake8_df = flake8_df.join(sps[['model', 'task', 'run_ts']]) + +exploded_score = exploded_score.merge( + flake8_df[['model', 'task', 'run_ts', 'final_flake8_score']], + on=['model', 'task', 'run_ts'], + how='left' +) +# Function to get last 8 runs +def get_last_best8(df): + if len(df) > 8: + return df.sort_values('run_id').tail(8) + else: + return df + +# Keep only last 8 successful runs per task 
+exp_score_filtered = exploded_score.groupby(['task','model']).apply(get_last_best8).reset_index(drop=True) +print(f"\nFinal rows after keeping last 8 per task: {len(exp_score_filtered)}") + +output_file=os.path.join(CSV_FILES, 'Final_step_perfomance' + datetime.isoformat(datetime.now()) +'.csv') +exp_score_filtered.to_csv(output_file,index=False) + +# loading baseline results +result_path = '/scratch3/workspace/ayman_sandouk_uri_edu-fairness/fairnessBench/baseline_results' +result_files = [ + os.path.join(result_path, resjson) + for resjson in os.listdir(result_path) + if os.path.isfile(os.path.join(result_path, resjson)) +] + +result_list = [pd.read_json(rf).T for rf in result_files] +result_df = pd.concat(result_list) + +end_series = lambda s: pd.Series(s[-4:]) +model_run = result_df['path'].str.split('/').apply(end_series).rename(columns = + {i:c for i,c in enumerate(['task','run_ts'])}) + +model_run['run_id']= model_run.groupby(['task','run_ts']).cumcount() +mr_keep = ['task','run_ts','run_id'] +exploded_score = result_df['final_score'].apply(pd.Series).reset_index() +exploded_score['score_count'] = exploded_score.groupby('index').cumcount() +sp = exploded_score['index'].str.split('/').apply(end_series) +sp = sp.rename(columns={i: c for i, c in enumerate([ 'task', 'run_ts'])}) + +exploded_score = exploded_score.join(sp[['task', 'run_ts']]) + +exploded_score['run_id'] = exploded_score.groupby(['task','run_ts']).cumcount() + +cols = [ 'task', 'run_ts','run_id'] + [col for col in exploded_score.columns if col not in [ 'task', 'run_ts', 'run_id']] +exploded_score = exploded_score[cols] +exploded_score = exploded_score.drop(exploded_score.columns[3],axis=1) +cols_to_prefix = [col for col in exploded_score.columns if col not in ['task', 'run_ts', 'run_id']] +exploded_score = exploded_score.rename( + columns={col: f'baseline_{col}' for col in cols_to_prefix} +) + +output_file=os.path.join(CSV_FILES, 'Baseline_cleaned_perfomance' + datetime.isoformat(datetime.now()) 
+'.csv') +exploded_score.to_csv(output_file,index=False) diff --git a/fairnessbench_analysis/performance_flake8_code.py b/fairnessbench_analysis/performance_flake8_code.py new file mode 100644 index 0000000..b456621 --- /dev/null +++ b/fairnessbench_analysis/performance_flake8_code.py @@ -0,0 +1,45 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + +# Loading useful dataframes +file = CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv' +code_eval = pd.read_csv(file) + + +# Removing missing rows +code_eval= code_eval.dropna(how="any") + +task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. 
Evaluation and Testing",'final_flake8_score'] +wider_code = wider_code[wider_cols] + +sns.set_context(context='poster',font_scale=0.8) +g = sns.relplot(data=wider_code, + x='final_flake8_score', + y='total_llm_score', + col='model', + row='task_dataset', + hue='resrch_prob', + kind='scatter', + alpha=0.7, + height=4, + aspect=1) + +g.set_axis_labels('Flake8 Score', 'LLM Code Score') +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +# add reference lines at a score of 85 on both axes of each cell plot +for i, ax in enumerate(g.axes.flat): + ax.axhline(y=85.0, color='black', linestyle='-.', alpha=0.2) + ax.axvline(x=85.0, color='black', linestyle='-.', alpha=0.2) + +output = os.path.join(GRAPHS,'performance_flake8_code.png') +plt.savefig(output,dpi=300) + + diff --git a/fairnessbench_analysis/readme.md b/fairnessbench_analysis/readme.md new file mode 100644 index 0000000..4ea8652 --- /dev/null +++ b/fairnessbench_analysis/readme.md @@ -0,0 +1,53 @@ +# fairnessbench analysis + +This folder contains all the code and data for analyzing the fairnessbench results. +The main analysis script is explode_results.py, which loads the raw results data and creates clean CSV files ready for analysis. + +# A. Setup: + +**Local path configuration** +1. Create `path.py` at the repo root (the analysis scripts load it with `from path import ...`). +2. Create the csv_files and graphs directories, then set CSV_FILES and GRAPHS in `path.py` to the absolute paths on your machine, as `pathlib.Path` objects (the scripts build file names with `CSV_FILES/'name.csv'`). +3. `path.py` is in `.gitignore`. +### Required variables in `path.py` +Raw results are the original JSON outputs generated by running agents on benchmark tasks, containing performance metrics, fairness scores, and Flake8 scores before any processing or analysis. +- **PROJECT_ROOT** — Directory that contains all *raw results*. +- **CSV_FILES** — Directory that contains the *clean CSV files* produced by `explode_results.py`. +- **GRAPHS** — Directory where analysis scripts will save generated *figures/plots*. 
+- **FILES** — Directory that stores *CSV files from different evaluation models* (used by `cv_scores_evalmodels.py`). + +# B. Run Analysis: +**Run main file** +```bash +python explode_results.py +``` +This will create the following files in the csv_files/ directory: +- Result_Final_code_clean*.csv: File contains raw scores and final scores from the LLM evaluation of the training scripts (code). +- Result_Final_log_clean*: File contains raw scores and final scores from the LLM evaluation of the agent's reasoning process (log). +- Final_step_perfomance*.csv: File contains performance metric (e.g. accuracy, disparate impact, etc.) scores of the models on each task; the file name is spelled `perfomance`, exactly as `explode_results.py` writes it. Baseline scores are written alongside it as Baseline_cleaned_perfomance*.csv. + +These files are then used for further analysis. + +## Analysis: +In FairnessBench we run several analyses on our results. Each `.py` file performs a different analysis and generates plots that are stored in the `graphs/` directory. +To run an analysis, change the input CSV filename in the script to the file required for that analysis. +**Example:** To analyze different types of fairness for the Adult dataset, run `adult_fairness.py`. Before running it, update the script’s input CSV to the new file generated in the `csv_files/` directory. + +```bash +python <analysis_script>.py +``` +## Key files: +- adult_fairness.py: Analyzes the target fairness metrics used in the benchmark for the Adult dataset. It generates Figure 4 in Section 4.2.2. +- balancing_fairness.py: Analyzes the fairness metrics used in the benchmark for the Balance research problem. This file generates Figure 6 in Section 4.3. +- di_across_datasets.py: Analyzes fairness (disparate impact (DI), statistical parity difference (SPD), and equal opportunity difference (EOD)) and accuracy across datasets and research problems. It generates Figure 3 in Section 4.2.1 and Figure 13 in the appendix. 
+- comparing_flake8_bal_be_impli.py: Generates a bar plot (explained in Section 4.3) of Flake8 performance for 3 research problems (balance, best, implicit) for different models and datasets. +- target_selection.py: Analyzes the model's performance and fairness metrics for the target selection tasks. It generates Figure 9 in Section 4.5.1. +- target10_sucess_rate.py: Generates Figure 5 in Section 4.2.3, which shows how agent performance differs from the baseline on the Target10 research problem, by dataset and model. +- acc_di_tradeoff_heatmap.py: Generates a heatmap (Figure 7, Section 4.3) that shows the tradeoff between accuracy and fairness (disparate impact) for 3 research problems (balance, best, implicit) using the Pareto frontier. +- acc_fairness_overlap_heatmap.py: Analyzes how the dataset variations (randoadult, sampadult, nondescriptive, health) compare to the Adult dataset. It generates Figures 8 and 12 in Section 4.4. +- run_count.py: Analyzes the number of completed and successful runs for each model and dataset. It generates Tables 4 and 10 in the appendix. +- sensitivity_analysis.py: Analyzes the LLM agent's performance and fairness for different versions of the same prompt. This file generates Figures 10 and 11 in Section 4.6. 
+- dollarstreet_analysis.py: codebase for analyzing the performance across income levels which is explained in Section 4.5.2 + + + diff --git a/fairnessbench_analysis/run_count.py b/fairnessbench_analysis/run_count.py new file mode 100644 index 0000000..e20d7e3 --- /dev/null +++ b/fairnessbench_analysis/run_count.py @@ -0,0 +1,66 @@ +import pandas as pd +import os +from datetime import datetime + +# Read and combine all result files +result_path = '/scratch3/workspace/ayman_sandouk_uri_edu-fairness/fairnessBench/final_results' +result_files = [ + os.path.join(result_path, resjson) + for resjson in os.listdir(result_path) + if os.path.isfile(os.path.join(result_path, resjson)) +] +result_list = [pd.read_json(rf).T for rf in result_files] +result_df = pd.concat(result_list) +print(f"Total rows before filtering: {len(result_df)}") + +end_series = lambda s: pd.Series(s[-5:]) +sp = result_df['path'].str.split('/').apply(end_series) +sp = sp.rename(columns={i: c for i, c in enumerate(['model', 'task', 'run_ts'])}) +exp_score = result_df.join(sp[['model', 'task', 'run_ts']]) +exp_score['run_id'] = exp_score.groupby(['model', 'task']).cumcount() + +cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exp_score.columns if col not in ['model', 'task', 'run_ts', 'run_id']] +exp_score = exp_score[cols] + + +no_final = ~exp_score["final_score"].apply(lambda x: isinstance(x, dict)) +has_final = ~no_final +no_err = exp_score["error"] == "" +time = exp_score["total_time"] > 0 + +# aggregate counts per model/task +summary = ( + exp_score + .assign( + has_final_score = has_final, + time_no_error = (time & no_err) + ) + .groupby(["model", "task"]) + .agg( + runs=("run_ts", "count"), + sucessful_runs=("has_final_score", "sum"), + completed_runs =('time_no_error','sum') + ) + .reset_index() +) +task_decomp = summary['task'].str.split('_').apply(pd.Series).rename( + columns={i:col for i,col in enumerate(['dataset','target_metric','task-dem'])}) +task_dem = 
task_decomp['task-dem'].str.split('-').apply(pd.Series).rename( + columns={i:col for i,col in enumerate(['research_problem','dem'])}) + +df = pd.concat([summary,task_decomp,task_dem],axis=1) +df +new_df = ( + df.groupby(['model','research_problem'])[ + ['runs','completed_runs','sucessful_runs'] + ] + .sum() + .reset_index() +) + +new_df['success_rate'] = new_df['sucessful_runs'] / new_df['runs'] +#new_df['completion_rate'] = new_df['completed_runs'] / new_df['runs'] +print(new_df.to_latex()) + +m = new_df.groupby(['model','research_problem'])['success_rate'].mean().unstack() +print(m.to_latex()) diff --git a/fairnessbench_analysis/sensitivity_analysis.py b/fairnessbench_analysis/sensitivity_analysis.py new file mode 100644 index 0000000..92f24ab --- /dev/null +++ b/fairnessbench_analysis/sensitivity_analysis.py @@ -0,0 +1,216 @@ + +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns + + +df_final = pd.read_csv("Final_step_perfomance2026-02-07T02:23:17.201613.csv") +#df_baseline = pd.read_csv("Baseline_cleaned_perfomance2026-02-07T02:23:19.806782.csv") + + +df_final.columns + + +df_final.head(2) + + +#df_baseline.head(2) + + +cols_to_keep = ['model', 'task', 'run_ts', 'run_id', 'acc', 'precision', 'recall', 'di', 'statistical_parity_diff', +'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', 'false_omission_rate_diff'] + + +df_final = df_final[cols_to_keep] +df_final.head(2) + +list_of_values = ['adult_balance-eod-sex', 'adult_eod_balance-sex', 'adult_balance-eod-nosuccess-sex', +'adult_balance-eod-shortgoal-sex', 'adult_balance-eod-noreq-sex', 'adult_balance-eod-nochange-sex', 'adult_balance-eod-nohow-sex', +'adult_balance-eod-rephrased01-sex', 'adult_balance-eod-rephrased10-sex', 'adult_balance-eod-rephrased06-sex', +'adult_balance-eod-rephrased05-sex', 'adult_balance-eod-rephrased03-sex', 'adult_balance-eod-altmetricdetail-sex', +'adult_balance-eod-altmetricdetail2-sex'] + 
+filtered_df = df_final[df_final['task'].isin(list_of_values)].copy() +filtered_df + + +filtered_df['task']=filtered_df['task'].replace('adult_eod_balance-sex', 'adult_balance-eod-original-sex') + + +filtered_df['task'].value_counts() + + +data_task = filtered_df['task'].str.split('_').apply(pd.Series).rename(columns={0: 'data', 1: 'task_info'}) +data_task + + +data_task2 = data_task['task_info'].str.split('-').apply(pd.Series).rename(columns={0:'rp', 1:'f_metric', 2:'prompt_variation',3:'dem'}) +data_task2 + + +df_wide = pd.concat([filtered_df, data_task, data_task2],axis=1) +cols=['model','data','rp','f_metric','prompt_variation','dem', 'run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff'] +df_wide = df_wide[cols] +df_wide=df_wide.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') +df_wide.head() + + + +def get_metric(df, model, rp, data, f_metric, prompt_variation): # values of column f_metric for the rows of one model / research problem / dataset / prompt variation + return df[ + (df["model"]==model) & + (df["rp"]==rp) & # df_wide names the research-problem column 'rp' + (df["data"]==data)& # and the dataset column 'data' + (df["prompt_variation"]==prompt_variation) + ][f_metric].values + + +df_wide['prompt_variation'].value_counts() + + +count = df_wide.groupby(['model', 'prompt_variation'])["rp"].count() + + +count.to_csv("counts.csv") + + +df_wide = df_wide[df_wide["prompt_variation"] != "noreq"].copy() + + +df_wide['prompt_variation'].value_counts() + + +df_wide["prompt_variation"] = df_wide["prompt_variation"].replace({ + 'altmetricdetail': 'altmetricnames', + 'altmetricdetail2': 'informalgoal', + 'rephrased01': 'informaldirect', + 'rephrased03': 'altnowork', + 'rephrased05': 'verbosedetail', + 'rephrased06': 'informationalpassive', + 'rephrased10': 'passivedata'}) + + +df_wide['prompt_variation'].unique() + + +summary = ( + df_wide.groupby(["model", "rp", "prompt_variation"]) + .agg( + mean_acc=("acc","mean"), + mean_di=("di","mean"), + std_acc=("acc","std"), + std_di=("di","std") + ) + .reset_index() +) +summary + 
+allowed_prompts = ['altmetricnames', 'informalgoal', 'nochange', 'nohow', 'nosuccess', 'informaldirect', 'altnowork', +'verbosedetail', 'informationalpassive', 'passivedata', 'shortgoal'] + +results = [] + +for (model, rp), group in summary.groupby(["model", "rp"]): + + # baseline = original prompt + base_row = group[group["prompt_variation"] == "original"] + + if base_row.empty: + continue + + base_row = base_row.iloc[0] + + # baseline interval + base_min_di = base_row["mean_di"] - base_row["std_di"] + base_max_di = base_row["mean_di"] + base_row["std_di"] + base_min_acc = base_row["mean_acc"] - base_row["std_acc"] + base_max_acc = base_row["mean_acc"] + base_row["std_acc"] + + for _, row in group.iterrows(): + + prompt = row["prompt_variation"] + + if prompt == "original": + continue + + if prompt not in allowed_prompts: + continue + + # variation interval + row_min_di = row["mean_di"] - row["std_di"] + row_max_di = row["mean_di"] + row["std_di"] + row_min_acc = row["mean_acc"] - row["std_acc"] + row_max_acc = row["mean_acc"] + row["std_acc"] + + results.append({ + "model": model, + "rsch_prob": rp, + "baseline_prompt": "original", + "comparison_prompt": prompt, + # mean shifts + "fairness_diff": abs(base_row["mean_di"] - row["mean_di"]), + "accuracy_diff": abs(base_row["mean_acc"] - row["mean_acc"]), + # overlap between two intervals + "overlap_fair": max(0, min(base_max_di, row_max_di) - max(base_min_di, row_min_di)), + "overlap_acc": max(0, min(base_max_acc, row_max_acc) - max(base_min_acc, row_min_acc)), + # baseline interval length + "len_fair_base": base_max_di - base_min_di, + "len_acc_base": base_max_acc - base_min_acc}) + +sens_df = pd.DataFrame(results) + + +sens_df['comparison_prompt'].unique() + + +sens_df['final_overlap_fair'] = sens_df['overlap_fair'] / sens_df['len_fair_base'] +sens_df['final_overlap_acc'] = sens_df['overlap_acc'] / sens_df['len_acc_base'] +sens_df + + +heatmap_data = sens_df.pivot_table( + index=["rsch_prob", "model"], + 
columns="comparison_prompt", + values="final_overlap_fair", + aggfunc="mean" +).sort_index(level=["rsch_prob", "model"]) + + +plt.figure(figsize=(12,6)) + +sns.heatmap( + heatmap_data, + annot=True, + cmap="viridis", + linewidths=0.5 +) +plt.title("Fairness Sensitivity Across Prompt Variations") +plt.xlabel("Comparison Prompts") +plt.ylabel("Research Problem | Model") + +plt.savefig("prompt_sensitivity_fair_overlap.png", dpi=200, bbox_inches="tight") + + +heatmap_data = sens_df.pivot_table( + index=["rsch_prob", "model"], + columns="comparison_prompt", + values="final_overlap_acc", + aggfunc="mean" +).sort_index(level=["rsch_prob", "model"]) + +plt.figure(figsize=(12,6)) + +sns.heatmap( + heatmap_data, + annot=True, + cmap="viridis", + linewidths=0.5 +) + +plt.title("Accuracy Sensitivity Across Prompt Variants") +plt.xlabel("Comparison Prompts") +plt.ylabel("Research Problem | Model") +plt.savefig("prompt_sensitivity_acc_overlap.png", dpi=200, bbox_inches="tight") diff --git a/fairnessbench_analysis/target10_sucess_rate.py b/fairnessbench_analysis/target10_sucess_rate.py new file mode 100644 index 0000000..5c55146 --- /dev/null +++ b/fairnessbench_analysis/target10_sucess_rate.py @@ -0,0 +1,110 @@ +import os +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + +# Loading useful dataframes +perf_df = pd.read_csv(CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv') + +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +#perf_df= perf_df.dropna(subset=perf, how='all') +perf_df = perf_df.replace([np.inf, -np.inf], np.nan).fillna(0) + +# loading baseline + +baseline_df = pd.read_csv(CSV_FILES/'Baseline_cleaned_perfomance2026-02-07T02:23:19.806782.csv') + +base= ['baseline_acc','baseline_precision', 'baseline_recall', 'baseline_di', 
'baseline_statistical_parity_diff', + 'baseline_equal_opp_diff', 'baseline_error_rate_diff', 'baseline_error_rate_ratio', + 'baseline_false_omission_rate_diff'] +#baseline_df= baseline_df.dropna(subset=base, how='all') +baseline_df= baseline_df.fillna(0) +baseline_df= baseline_df.drop(columns=['run_ts','run_id','baseline_score_count']) + +# merging both dfs +merged_results= perf_df.merge(baseline_df, how='left',on=['task']) +merged_results= merged_results.dropna(how='any') +# rearranging cols +task_data_metric = merged_results['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_10 = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resch_prob',1:'dem'}) +wider = pd.concat([task_10, task_data_metric],axis=1) +col= ['task_dataset','task_metric','resch_prob','dem'] +wider=wider[col] +clean_df = pd.concat([merged_results, wider], axis=1) + +columns=['model','task','task_dataset','task_metric','resch_prob','dem','run_ts','run_id', +'acc','baseline_acc','precision','baseline_precision','recall', +'baseline_recall','di','baseline_di','statistical_parity_diff','baseline_statistical_parity_diff','equal_opp_diff', +'baseline_equal_opp_diff','error_rate_diff','baseline_error_rate_diff','error_rate_ratio','baseline_error_rate_ratio', +'false_omission_rate_diff','baseline_false_omission_rate_diff'] +clean_df=clean_df[columns] + +# filtering target10 task +df= clean_df[clean_df['resch_prob'] == 'target10'] + +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 'erd' : 'error_rate_diff', + 'ford': 'false_omission_rate_diff', +} + +metric_best = { + 'acc': 1, + 'di': 1, + 'spd': 0, + 'eod': 0, + 'err' :1, + 'erd' : 0, + 'ford': 0, +} + +# subtract diff directions so that + is improvement and - is worse in result +metric_best_fx = { + 'acc': lambda r: r['task_metric_value'] - 
r['task_metric_value_baseline'], + 'di': lambda r: abs(1- r['task_metric_value_baseline'])- abs(1- r['task_metric_value']), + 'spd': lambda r: abs(r['task_metric_value_baseline']) - abs(r['task_metric_value']), # ideal is 0: compare distances from 0 so a sign flip cannot count as an improvement + 'eod': lambda r: abs(r['task_metric_value_baseline']) - abs(r['task_metric_value']), # ideal is 0 + 'err' : lambda r: abs(1- r['task_metric_value_baseline'])- abs(1- r['task_metric_value']), + 'erd' : lambda r: abs(r['task_metric_value_baseline']) - abs(r['task_metric_value']), # ideal is 0 + 'ford': lambda r: abs(r['task_metric_value_baseline']) - abs(r['task_metric_value']), # ideal is 0 +} + +imp_text = {True:'improvement', False:'no improvement'} +def improvement_label(r): # text label for one row; named apart from the improvement() aggregator defined below + return imp_text[r['agent-improvement']>0] + +df.loc[:,'task_metric_value'] = df.apply(lambda r: r[metric_map[r['task_metric']]],axis=1) +df.loc[:,'task_metric_value_baseline'] = df.apply(lambda r: r['baseline_'+metric_map[r['task_metric']]],axis=1) +df.loc[:,'agent-baseline'] = df.loc[:,'task_metric_value'] - df.loc[:,'task_metric_value_baseline'] +df.loc[:,'agent-improvement'] = df.apply(lambda r: metric_best_fx[r['task_metric']](r),axis=1) +df.loc[:,'agent-impact'] = df.apply(improvement_label,axis=1) + +def success(s): # number of runs that improved by more than 0.1 + return sum(s>.1) + +def total(s): # number of runs in the group + return len(s) + +def improvement(s): # number of runs with any improvement; the function name becomes the aggregated column name + return sum(s>0) + +df_improvement_stats = df.groupby(['model','task_dataset',])['agent-improvement'].agg(['mean',success,total,improvement]).reset_index() +df_improvement_stats_tall = df_improvement_stats.melt(id_vars=['model','task_dataset'], + value_vars=['total','success','improvement'],var_name='count_type',value_name='count') +sns.set_context(context='poster',font_scale = .5) +output= os.path.join(GRAPHS,'target10_success.png' ) +g=sns.catplot(df_improvement_stats_tall, col = 'model',x='task_dataset', y='count',hue='count_type',kind='bar') +# Rotate x-axis labels +for ax in g.axes.flatten(): + ax.tick_params(axis='x', labelrotation=35) +g.savefig(output) # save once, after every axis has been adjusted + diff --git a/fairnessbench_analysis/target_selection.py b/fairnessbench_analysis/target_selection.py new file mode 
100644 index 0000000..9e22e35 --- /dev/null +++ b/fairnessbench_analysis/target_selection.py @@ -0,0 +1,61 @@ +import os +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + +# Loading useful dataframes +file = CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' +res = pd.read_csv(file) + +task_data_metric = res['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([res, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff'] +wider_code = wider_code[wider_cols] + +wider_adrecon = wider_code[wider_code['task_dataset']=='adrecon'].reset_index(drop=True) + +# % of times (per model/etc) that actually gets a final result +allmetrics = ['acc', 'precision', 'recall', 'di', 'statistical_parity_diff', 'equal_opp_diff', + 'error_rate_diff', 'error_rate_ratio', 'false_omission_rate_diff'] + +wider_adrecon['all_metric_vals'] = wider_adrecon[allmetrics].notna().all(axis=1) + +res = wider_adrecon.groupby('model')['all_metric_vals'].mean() * 100 +res.round(2) +wider_adrecon = wider_adrecon.dropna() + +# regular performance of final models +wider_adrecon = wider_adrecon.rename(columns={'statistical_parity_diff': 'spd', 'equal_opp_diff': 'eod', + 'error_rate_diff': 'erd', 'error_rate_ratio': 'err', + 'false_omission_rate_diff': 'ford', 'precision': 'p', + 'recall': 'r'}) + +allmet = ['acc', 'p', 'r', 'di', 'spd', 'eod', 'erd', 'err', 'ford'] +adrec_res = (wider_adrecon.groupby(['model','task-dem'])[allmet].mean()).reset_index() +adrec_task_dem= adrec_res['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +adrec_res=pd.concat([adrec_res,adrec_task_dem],axis=1) 
+adrec_res=adrec_res.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') +allmet = ['acc', 'p', 'r', 'di', 'spd', 'eod', 'erd', 'err', 'ford'] + +adrec_long = pd.melt(adrec_res, id_vars=['model', 'task-dem', 'resrch_prob', 'dem'], value_vars= allmet, var_name='task_metric', value_name='task_metric_value') +adrec_long['model-dem'] = adrec_long['model'] + '-' + adrec_long['dem'] +adrec_long = adrec_long.rename(columns={'task_metric': 'task_metrics'}) + +high_good = ['acc', 'p', 'r', 'di','err'] # +low_good = ['spd', 'eod', 'erd', 'ford'] + +metric_rename = {} + +for m in high_good: + metric_rename[m] = f"{m} ↑" + +for m in low_good: + metric_rename[m] = f"{m} ↓" + +adrec_long['task_metric'] = adrec_long['task_metrics'].map(metric_rename) +g = sns.catplot(data=adrec_long,kind='bar',x='task_metric',y='task_metric_value',hue='model',col='dem',height=4,aspect=1.5) + +output= os.path.join(GRAPHS,"adrec_allmetric.png") +plt.savefig(output,dpi=400,bbox_inches='tight') \ No newline at end of file