From 94c11b04fabaefe2cdc51f17e00256d28af2cf8a Mon Sep 17 00:00:00 2001 From: Ritta Neg mfa Date: Wed, 6 Aug 2025 08:06:02 +0000 Subject: [PATCH 1/9] eval anylysis files --- fairnessbench_analysis/Deepseek_cv.csv | 3 + fairnessbench_analysis/Gemma_cv.csv | 8 + fairnessbench_analysis/Granite_cv.csv | 3 + .../adult_di_code_llmeval.py | 53 ++++++ fairnessbench_analysis/adult_fairness.py | 58 +++++++ fairnessbench_analysis/balancing_fairness.py | 57 +++++++ fairnessbench_analysis/code_log_llm_eval.py | 46 ++++++ .../correlation_flake8_code.py | 51 ++++++ .../cv_scores_evalmodels.py | 17 ++ fairnessbench_analysis/di_across_datasets.py | 41 +++++ fairnessbench_analysis/explode_results.py | 154 ++++++++++++++++++ .../performance_flake8_code.py | 38 +++++ .../target10_sucess_rate.py | 101 ++++++++++++ fairnessbench_analysis/target_selection.py | 54 ++++++ 14 files changed, 684 insertions(+) create mode 100644 fairnessbench_analysis/Deepseek_cv.csv create mode 100644 fairnessbench_analysis/Gemma_cv.csv create mode 100644 fairnessbench_analysis/Granite_cv.csv create mode 100644 fairnessbench_analysis/adult_di_code_llmeval.py create mode 100644 fairnessbench_analysis/adult_fairness.py create mode 100644 fairnessbench_analysis/balancing_fairness.py create mode 100644 fairnessbench_analysis/code_log_llm_eval.py create mode 100644 fairnessbench_analysis/correlation_flake8_code.py create mode 100644 fairnessbench_analysis/cv_scores_evalmodels.py create mode 100644 fairnessbench_analysis/di_across_datasets.py create mode 100644 fairnessbench_analysis/explode_results.py create mode 100644 fairnessbench_analysis/performance_flake8_code.py create mode 100644 fairnessbench_analysis/target10_sucess_rate.py create mode 100644 fairnessbench_analysis/target_selection.py diff --git a/fairnessbench_analysis/Deepseek_cv.csv b/fairnessbench_analysis/Deepseek_cv.csv new file mode 100644 index 0000000..22fefe5 --- /dev/null +++ b/fairnessbench_analysis/Deepseek_cv.csv @@ -0,0 +1,3 @@ 
+model,task,1. Data Collection and Processing,2. Bias Detection and Mitigation,3. Fairness Metric Selection,4. Model Selection and Training,5. Evaluation and Testing +claude-3-7-sonnet-20250219,adult_err_balance-race,1.1035347137172349,0.2793492449300636,0.1818753236896995,0.15465752264217267,0.23111601141098256 +qwen,adult_err_balance-race,0.0,0.29632197530102916,0.3853887243714261,0.4880266796833574,0.34656317045517515 diff --git a/fairnessbench_analysis/Gemma_cv.csv b/fairnessbench_analysis/Gemma_cv.csv new file mode 100644 index 0000000..f9f85a4 --- /dev/null +++ b/fairnessbench_analysis/Gemma_cv.csv @@ -0,0 +1,8 @@ +model,task,1. Data Collection and Processing,2. Bias Detection and Mitigation,3. Fairness Metric Selection,4. Model Selection and Training,5. Evaluation and Testing +claude-3-7-sonnet-20250219,adult_err_balance-race,0.15946509441945983,0.11649810905340587,0.0,0.15946509441945983,0.0 +claude-3-7-sonnet-20250219,adult_err_balance-sex,0.17494570236436235,0.0,0.0,0.1957400731715678,0.0 +gpt-4o,adult_err_balance-race,0.0,0.0,0.0,0.0,0.0 +gpt-4o,adult_err_balance-sex,0.0,0.0,0.0,0.0,0.0 +llama,adult_err_balance-race,0.20573779994945587,0.19716158838352976,0.0,0.12297509238026914,0.0 +llama,adult_err_balance-sex,0.3043212760213842,0.12426253043692712,0.0,0.16886551261045996,0.0 +qwen,adult_err_balance-sex,0.2138089935299395,0.19716158838352973,0.0,0.19716158838352973,0.0 diff --git a/fairnessbench_analysis/Granite_cv.csv b/fairnessbench_analysis/Granite_cv.csv new file mode 100644 index 0000000..e0023d9 --- /dev/null +++ b/fairnessbench_analysis/Granite_cv.csv @@ -0,0 +1,3 @@ +model,task,1. Data Collection and Processing,2. Bias Detection and Mitigation,3. Fairness Metric Selection,4. Model Selection and Training,5. 
Evaluation and Testing +claude-3-7-sonnet-20250219,adult_err_balance-race,0.04932502891543654,0.09584211726899525,0.14691056734678462,0.0,0.04844009143018392 +qwen,adult_err_balance-race,0.18672359914948844,1.282654434033444,0.2803402154503214,0.062112999374994156,0.28247912462432095 diff --git a/fairnessbench_analysis/adult_di_code_llmeval.py b/fairnessbench_analysis/adult_di_code_llmeval.py new file mode 100644 index 0000000..96ffbce --- /dev/null +++ b/fairnessbench_analysis/adult_di_code_llmeval.py @@ -0,0 +1,53 @@ +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns + +# Loading useful dataframes +code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') +# Removing missing rows fairnessBench +code_eval= code_eval.dropna(how="any") + +task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"] +wider_code = wider_code[wider_cols] + +# filtering the adult dataset and the di task_metric +adult= wider_code[wider_code['task_dataset']=='adult'] +adult_di=adult[adult['task_metric']=='di'] + +long_df = adult_di.melt( + id_vars=['model','task_dataset','task_metric','resrch_prob','dem'], + value_vars=[ + '1. Data Collection and Processing', + '2. Bias Detection and Mitigation', + '3. Fairness Metric Selection', + '4. Model Selection and Training', + '5. 
Evaluation and Testing' + ], + var_name='rubric_section', + value_name='score' +) + +sns.set_context(context='poster',font_scale=1.0) +plt.figsize=(16,12) +m=sns.catplot( + data=long_df, + x="rubric_section", + y="score", + hue="model", + col="resrch_prob", + row='dem', + kind="bar", + aspect=2 +) +m.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +ax=m.axes +ax = m.axes +for ax in m.axes.flatten(): + plt.setp(ax.get_xticklabels(), rotation=30) + ax.axhline(y=4.0, color='black', linestyle='-.', alpha=0.3) +plt.savefig('adult_di_code_llm_eval.png', dpi=400 , bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/adult_fairness.py b/fairnessbench_analysis/adult_fairness.py new file mode 100644 index 0000000..bef5c41 --- /dev/null +++ b/fairnessbench_analysis/adult_fairness.py @@ -0,0 +1,58 @@ +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns + +# Loading useful dataframes +perf_alt = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_alt= perf_alt.dropna(subset=perf, how='all') +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider_code = wider_code[wider_cols] + +# Filtering only adult dataset from the dataframe 
+wider_adult = wider_code[wider_code['task_dataset']=='adult'] +fairness_metrics= ['di','error_rate_ratio','statistical_parity_diff','equal_opp_diff','error_rate_diff','false_omission_rate_diff'] +wider_ADULT = ( + wider_adult.groupby(['model','task-dem','task_metric'])[fairness_metrics].mean() +).reset_index() +ad_df= wider_ADULT['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_ADULT=pd.concat([wider_ADULT,ad_df],axis=1) +wider_ADULT=wider_ADULT.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 'erd' : 'error_rate_diff', + 'ford': 'false_omission_rate_diff', +} +wider_ADULT.loc[:, 'task_metric_value'] = wider_ADULT.apply(lambda row: row[metric_map[row['task_metric']]], axis=1) + +sns.set_context(context='poster',font_scale=1.0) +g=sns.catplot(data=wider_ADULT,x='resrch_prob',y='task_metric_value',hue='dem',row='task_metric',col='model',kind='bar' + ,aspect=1) + +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +# adding horizontal lines at di=1 to each cell plot +ax=g.axes +for i in range(4): + ax[0,i].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) + ax[1,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) + ax[2,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) + ax[3,i].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) + ax[4,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) + ax[5,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) + +plt.savefig('adult_fairness.png',dpi=300, bbox_inches='tight') diff --git a/fairnessbench_analysis/balancing_fairness.py b/fairnessbench_analysis/balancing_fairness.py new file mode 100644 index 0000000..d77f134 --- /dev/null +++ b/fairnessbench_analysis/balancing_fairness.py @@ -0,0 +1,57 @@ +import pandas as pd +import numpy as np +from datetime import datetime 
+import matplotlib.pyplot as plt +import seaborn as sns + +# Loading useful dataframes +perf_alt = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_alt= perf_alt.dropna(subset=perf, how='all') +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider_code = wider_code[wider_cols] + +task_task_dem = wider_code['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'rsch_prob',1:'dem'}) +wider = pd.concat([wider_code, task_task_dem],axis=1) +cols= ['model','task','task_dataset','task_metric','task-dem','rsch_prob','dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider=wider[cols] + +# Filtering only balance task from the dataframe +wider_balance = wider[wider['rsch_prob']=='balance'] +wider_balance=wider_balance.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +wider_balance= wider_balance.copy() +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 'erd' : 'error_rate_diff', + 'ford': 'false_omission_rate_diff', +} +wider_balance.loc[:, 'task_metric_value'] = wider_balance.apply(lambda row: 
row[metric_map[row['task_metric']]], axis=1) + +sns.set_context(context='poster',font_scale=1.0) +g=sns.relplot(data=wider_balance,x='acc',y='task_metric_value',hue='task_dataset',style='dem',row='task_metric',col='model',kind='scatter', + aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') + +# add horizontal lines at di=1 for the first row and a vertical line for acc=1 +ax=g.axes +for i in range(len(ax)): + for j in range(len(ax[0])): + if i in [0,3]: # first row (di) + ax[i,j].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) + elif i in [1, 2, 4, 5]: # other fairness metrics + ax[i,j].axhline(y=0.0, color='black', linestyle='-.', alpha=0.3) + ax[i,j].axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) +plt.savefig('balancing_fairness.png',dpi=400,bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/code_log_llm_eval.py b/fairnessbench_analysis/code_log_llm_eval.py new file mode 100644 index 0000000..64618f6 --- /dev/null +++ b/fairnessbench_analysis/code_log_llm_eval.py @@ -0,0 +1,46 @@ +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns + +# Loading useful dataframes +code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') +log_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Results_Final_log_clean2025-08-06T04:22:17.377479.csv') +perf_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') + +# Removing missing rows +code_eval= code_eval.dropna(how="any") +code_eval = code_eval.fillna(0) +log_eval= log_eval.dropna(how='any') +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_df= perf_df.dropna(subset=perf, how='all') +perf_df = perf_df.fillna(0) + +task_data_metric = 
code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','final_flake8_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"] +wider_code = wider_code[wider_cols] + +score_cols = ["1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"] +code_tall = wider_code.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'], + value_vars=score_cols,var_name='score') +sns.catplot(code_tall,col='model',row='resrch_prob',x= 'task_dataset',y='value',hue='score',kind='bar').savefig('codeval') + + +# log eval +task_data_metric = log_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_log = pd.concat([log_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id',"1. Model Overview", "2. Stakeholder Identification and Fairness Definition","3. Data Collection and Processing","4. Bias Detection and Mitigation","5. Fairness Metric Selection","6. Model Selection and Training","7. Evaluation and Testing"] +wider_log = wider_log[wider_cols] +wider_log.head() + +score_cols = ["1. Model Overview", "2. Stakeholder Identification and Fairness Definition","3. Data Collection and Processing","4. 
Bias Detection and Mitigation","5. Fairness Metric Selection","6. Model Selection and Training","7. Evaluation and Testing"] +log_tall = wider_log.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'], + value_vars=score_cols,var_name='score') +sns.catplot(log_tall,col='model',x='resrch_prob',row= 'task_dataset',y='value',hue='score',kind='bar').savefig('logval') + diff --git a/fairnessbench_analysis/correlation_flake8_code.py b/fairnessbench_analysis/correlation_flake8_code.py new file mode 100644 index 0000000..c8c15fa --- /dev/null +++ b/fairnessbench_analysis/correlation_flake8_code.py @@ -0,0 +1,51 @@ +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns + +# Loading useful dataframes +code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') +# Removing missing rows +code_eval= code_eval.dropna(how="any") + +task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing",'final_flake8_score'] +wider_code = wider_code[wider_cols] + +# Correlation between flake8 and code llm eval on claude_adult_di_erd task +code_cols=['1. Data Collection and Processing','2. Bias Detection and Mitigation','3. Fairness Metric Selection','4. Model Selection and Training', '5. 
Evaluation and Testing'] +group_cols = ["model", "task_dataset", "resrch_prob", "task_metric"] # Add 'task_dem' if needed + +def flake8_corr_matrix(group): + # Compute correlation between flake8_score and each rubric section + corrs = [group["final_flake8_score"].corr(group[rubric]) for rubric in code_cols] + return pd.Series(corrs, index=code_cols) + +corrs = ( + wider_code.groupby(group_cols) + .apply(flake8_corr_matrix) + .reset_index() +) +corrs=corrs.fillna(0) + +group_filter = ( + (corrs['model'] == 'claude-3-7-sonnet-20250219') & + (corrs['task_dataset'] == 'adult') & + (corrs['resrch_prob'] == 'balance') & + (corrs['task_metric'] == 'erd') +) +corr_row = corrs.loc[group_filter, code_cols] + +plt.figure(figsize=(8, 2)) +sns.heatmap( + corr_row.values.reshape(1, -1), + annot=True, + cmap='coolwarm', + xticklabels=code_cols, + yticklabels=['Flake8 score'] +) +plt.title("Flake8 vs Rubric Correlation (claude-3, adult, balance, erd)") +plt.savefig('flake8_vs_code_correlation.png',bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/cv_scores_evalmodels.py b/fairnessbench_analysis/cv_scores_evalmodels.py new file mode 100644 index 0000000..7d2979b --- /dev/null +++ b/fairnessbench_analysis/cv_scores_evalmodels.py @@ -0,0 +1,17 @@ +import pandas as pd + +gemma_df = pd.read_csv('../fairnessBench/fairnessbench_analysis/Gemma_cv.csv') +deepseek_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Deepseek_cv.csv') +granite_df=pd.read_csv('../fairnessBench/fairnessbench_analysis/Granite_cv.csv') + +gemma_df['eval'] = 'gemma' +deepseek_df['eval'] = 'deepseek' +granite_df['eval'] = 'granite' + +cols = ['eval', 'model', 'task'] + [c for c in gemma_df.columns if c not in ['eval', 'model', 'task']] +gemma_df = gemma_df[cols] +deepseek_df = deepseek_df[cols] +granite_df = granite_df[cols] + +all_eval_cv = pd.concat([gemma_df, deepseek_df, granite_df], axis=0, ignore_index=True) +all_eval_cv.to_csv('cv_scores_evalmodel.csv',index=False) \ 
No newline at end of file diff --git a/fairnessbench_analysis/di_across_datasets.py b/fairnessbench_analysis/di_across_datasets.py new file mode 100644 index 0000000..5226b39 --- /dev/null +++ b/fairnessbench_analysis/di_across_datasets.py @@ -0,0 +1,41 @@ +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns + +# Loading useful dataframes +perf_alt = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_alt= perf_alt.dropna(subset=perf, how='all') +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider_code = wider_code[wider_cols] + +# Filtering only DI from the dataframe +wider_di = wider_code[wider_code['task_metric']=='di'] +wider_DI = ( + wider_di.groupby(['task_dataset','task-dem'])[['di','acc']].mean().reset_index() +) +dem_df= wider_di['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_DI=pd.concat([wider_di,dem_df],axis=1) +wider_DI=wider_DI.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +# ploting the scatter plot for di vs acc +sns.set_context(context='poster',font_scale=0.8) +g=sns.relplot(data=wider_DI,x='acc',y='di',hue='task_dataset',row='resrch_prob',style='dem',col='model',kind='scatter',aspect=1) 
+g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') + +# adding horizontal lines at di=1 to each cell plot +for i, ax in enumerate(g.axes.flat): + ax.axhline(y=1.0, color='black', linestyle='-.', alpha=0.2) + ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) +# saving the plot +plt.savefig('di_vs_acc_scatter.png', dpi=300, bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/explode_results.py b/fairnessbench_analysis/explode_results.py new file mode 100644 index 0000000..ee328d5 --- /dev/null +++ b/fairnessbench_analysis/explode_results.py @@ -0,0 +1,154 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import json + +# loading the performance results +perf_path = '../results_manually_combined' +result_files = [ + os.path.join(perf_path, fname) + for fname in os.listdir(perf_path) + if os.path.isfile(os.path.join(perf_path, fname)) +] +result_list = [] +for rf in result_files: + try: + if os.path.getsize(rf) == 0: + print(f"Skipping empty file: {rf}") + continue + df = pd.read_json(rf).T + result_list.append(df) + except Exception as e: + print(f"Skipping file {rf} due to error: {e}") +performance_df = pd.concat(result_list) + +end_series = lambda s: pd.Series(s[-5:]) +model_run = performance_df['path'].str.split('/').apply(end_series).rename(columns = + {i:c for i,c in enumerate(['model','task','run_ts'])}) + +model_run['run_id']= model_run.groupby(['model','task']).cumcount() +mr_keep = ['model','task','run_ts','run_id'] + +# extracting the performance scores for the results to save in a csv file +exploded_score = performance_df['final_score'].apply(pd.Series).reset_index().drop(columns=[0]) +exploded_score['score_count'] = exploded_score.groupby('index').cumcount() +sp = exploded_score['index'].str.split('/').apply(end_series) +sp = sp.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) + +exploded_score = exploded_score.join(sp[['model', 
'task', 'run_ts']]) + +exploded_score['run_id'] = exploded_score.groupby(['model', 'task']).cumcount() + +cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exploded_score.columns if col not in ['model', 'task', 'run_ts', 'run_id']] +exploded_score = exploded_score[cols] +exploded_score = exploded_score.drop(exploded_score.columns[4],axis=1) +exploded_score.to_csv('Final_step_perfomance' + datetime.isoformat(datetime.now()) +'.csv',index=False) + + +# loading llm eval results +result_path = '../results_final_total' +result_files = [ + os.path.join(result_path, fname) + for fname in os.listdir(result_path) + if os.path.isfile(os.path.join(result_path, fname)) +] +result_list = [] +for rf in result_files: + try: + if os.path.getsize(rf) == 0: + print(f"Skipping empty file: {rf}") + continue + df = pd.read_json(rf).T + result_list.append(df) + except Exception as e: + print(f"Skipping file {rf} due to error: {e}") +result_df = pd.concat(result_list) + +end_series = lambda s: pd.Series(s[-5:]) +model_run = result_df['path'].str.split('/').apply(end_series).rename(columns = + {i:c for i,c in enumerate(['model','task','run_ts'])}) + +model_run['run_id']= model_run.groupby(['model','task']).cumcount() +mr_keep = ['model','task','run_ts','run_id'] +# extracting llm code evaluation +raw_df= result_df[["final_llm_score"]].explode('final_llm_score',)['final_llm_score'].apply(pd.Series).reset_index().drop(columns=[0]) +exp_code= raw_df["raw_scores"].apply(pd.Series).drop(columns=[0]) +exp_code = raw_df.join(raw_df["raw_scores"].apply(pd.Series)).drop(columns= ['raw_scores', 'justifications', 'subtotals',0]) +splits = exp_code['index'].str.split('/').apply(end_series) +splits = splits.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) + +exp_code = exp_code.join(splits[['model', 'task', 'run_ts']]) + +exp_code['run_id'] = exp_code.groupby(['model', 'task']).cumcount() + +cols = ['model', 'task', 'run_ts','run_id'] + [col for col in 
exp_code.columns if col not in ['model', 'task', 'run_ts', 'run_id']] +exp_code = exp_code[cols] +exp_code = exp_code.drop(exp_code.columns[4],axis=1) + +# adding flake8 results to the code llm eval df +flake8_df = result_df[['path', 'final_flake8_score']].copy() +sps = flake8_df['path'].str.split('/').apply(end_series) +sps = sps.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) +flake8_df = flake8_df.join(sps[['model', 'task', 'run_ts']]) + +# merging both dfs +exp_code = exp_code.merge( + flake8_df[['model', 'task', 'run_ts', 'final_flake8_score']], + on=['model', 'task', 'run_ts'], + how='left' +) +exp_code.to_csv('Result_Final_code_clean' + datetime.isoformat(datetime.now()) +'.csv',index=False) + +# extracting log llm eval results +raw_log= result_df[["final_log_score"]].explode('final_log_score',)['final_log_score'].apply(pd.Series).reset_index().drop(columns = [0]) +exp_log= raw_log["raw_scores"].apply(pd.Series).drop(columns = [0]) +exp_log = raw_log.join(raw_log["raw_scores"].apply(pd.Series)).drop(columns= ['raw_scores', 'justifications', 'subtotals',0]) +exp_log = exp_log.rename(columns={"total_llm_score":"total_log_score"}) +split = exp_log['index'].str.split('/').apply(end_series) +split = split.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) + +exp_log = exp_log.join(split[['model', 'task', 'run_ts']]) + +exp_log['run_id'] = exp_log.groupby(['model', 'task']).cumcount() + +cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exp_log.columns if col not in ['model', 'task', 'run_ts', 'run_id']] +exp_log = exp_log[cols] +exp_log = exp_log.drop(exp_log.columns[4],axis=1) +#exp_log.to_csv('Results_Final_log_clean' + datetime.isoformat(datetime.now()) +'.csv',index=False) + +# loading baseline results +result_path = '../sanity_results' +result_files = [ + os.path.join(result_path, resjson) + for resjson in os.listdir(result_path) + if os.path.isfile(os.path.join(result_path, resjson)) +] + 
+result_list = [pd.read_json(rf).T for rf in result_files] +result_df = pd.concat(result_list) + +end_series = lambda s: pd.Series(s[-4:]) +model_run = result_df['path'].str.split('/').apply(end_series).rename(columns = + {i:c for i,c in enumerate(['task','run_ts'])}) + +model_run['run_id']= model_run.groupby(['task','run_ts']).cumcount() +mr_keep = ['task','run_ts','run_id'] + +exploded_score = result_df[['score']].explode('score',)['score'].apply(pd.Series).reset_index().drop(columns = [0]) +exploded_score['score_count'] = exploded_score.groupby('index').cumcount() +sp = exploded_score['index'].str.split('/').apply(end_series) +sp = sp.rename(columns={i: c for i, c in enumerate([ 'task', 'run_ts'])}) + +exploded_score = exploded_score.join(sp[['task', 'run_ts']]) + +exploded_score['run_id'] = exploded_score.groupby(['task','run_ts']).cumcount() + +cols = [ 'task', 'run_ts','run_id'] + [col for col in exploded_score.columns if col not in [ 'task', 'run_ts', 'run_id']] +exploded_score = exploded_score[cols] +exploded_score = exploded_score.drop(exploded_score.columns[3],axis=1) +cols_to_prefix = [col for col in exploded_score.columns if col not in ['task', 'run_ts', 'run_id']] +exploded_score = exploded_score.rename( + columns={col: f'baseline_{col}' for col in cols_to_prefix} +) +exploded_score.to_csv('Baseline_cleaned_perfomance' + datetime.isoformat(datetime.now()) +'.csv',index=False) diff --git a/fairnessbench_analysis/performance_flake8_code.py b/fairnessbench_analysis/performance_flake8_code.py new file mode 100644 index 0000000..c142544 --- /dev/null +++ b/fairnessbench_analysis/performance_flake8_code.py @@ -0,0 +1,38 @@ +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns + +# Loading useful dataframes +code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') +# Removing missing rows +code_eval= 
code_eval.dropna(how="any") + +task_data_metric = code_eval['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_data_dem = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_code = pd.concat([code_eval, task_data_metric,task_data_dem],axis=1) +wider_cols=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id','total_llm_score',"1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing",'final_flake8_score'] +wider_code = wider_code[wider_cols] + +sns.set_context(context='poster',font_scale=0.8) +g = sns.relplot(data=wider_code, + x='final_flake8_score', + y='total_llm_score', + col='model', + row='task_dataset', + hue='resrch_prob', + kind='scatter', + alpha=0.7, + height=4, + aspect=1) + +g.set_axis_labels('Flake8 Score', 'LLM Code Score') +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +# add horizontal lines at di=1 to each cell plot +for i, ax in enumerate(g.axes.flat): + ax.axhline(y=85.0, color='black', linestyle='-.', alpha=0.2) + ax.axvline(x=75.0, color='black', linestyle='-.', alpha=0.2) +plt.savefig('performance_flake8_code.png',dpi=300) + + diff --git a/fairnessbench_analysis/target10_sucess_rate.py b/fairnessbench_analysis/target10_sucess_rate.py new file mode 100644 index 0000000..c0117a0 --- /dev/null +++ b/fairnessbench_analysis/target10_sucess_rate.py @@ -0,0 +1,101 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns + +perf_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') + +# Removing missing rows +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff'] +perf_df= 
perf_df.dropna(subset=perf, how='all') +perf_df = perf_df.replace([np.inf, -np.inf], np.nan).fillna(0) + +# loading baseline +baseline_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Baseline_cleaned_perfomance2025-08-06T06:09:53.582383.csv') + +base= ['baseline_acc','baseline_precision', 'baseline_recall', 'baseline_di', 'baseline_statistical_parity_diff', + 'baseline_equal_opp_diff', 'baseline_error_rate_diff', 'baseline_error_rate_ratio', + 'baseline_false_omission_rate_diff'] +baseline_df= baseline_df.dropna(subset=base, how='all') +baseline_df= baseline_df.fillna(0) +baseline_df= baseline_df.drop(columns=['run_ts','run_id','baseline_score_count']) + +# merging both dfs +merged_results= perf_df.merge(baseline_df, how='left',on=['task']) +merged_results= merged_results.dropna(how='any') +# rearranging cols +task_data_metric = merged_results['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +task_10 = task_data_metric['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resch_prob',1:'dem'}) +wider = pd.concat([task_10, task_data_metric],axis=1) +col= ['task_dataset','task_metric','resch_prob','dem'] +wider=wider[col] +clean_df = pd.concat([merged_results, wider], axis=1) + +columns=['model','task','task_dataset','task_metric','resch_prob','dem','run_ts','run_id', +'acc','baseline_acc','precision','baseline_precision','recall', +'baseline_recall','di','baseline_di','statistical_parity_diff','baseline_statistical_parity_diff','equal_opp_diff', +'baseline_equal_opp_diff','error_rate_diff','baseline_error_rate_diff','error_rate_ratio','baseline_error_rate_ratio', +'false_omission_rate_diff','baseline_false_omission_rate_diff','score_count'] +clean_df=clean_df[columns] + +# filtering target10 task +df= clean_df[clean_df['resch_prob'] == 'target10'] + +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 
'erd' : 'error_rate_diff', + 'ford': 'false_omission_rate_diff', +} + +metric_best = { + 'acc': 1, + 'di': 1, + 'spd': 0, + 'eod': 0, + 'err' :1, + 'erd' : 0, + 'ford': 0, +} + +# subtract diff directions so that + is improvement and - is worse in result +metric_best_fx = { + 'acc': lambda r: r['task_metric_value'] - r['task_metric_value_baseline'], + 'di': lambda r: abs(1- r['task_metric_value_baseline'])- abs(1- r['task_metric_value']), + 'spd': lambda r: r['task_metric_value_baseline'] - r['task_metric_value'], + 'eod': lambda r:r['task_metric_value_baseline'] - r['task_metric_value'], + 'err' : lambda r: abs(1- r['task_metric_value_baseline'])- abs(1- r['task_metric_value']), + 'erd' : lambda r:r['task_metric_value_baseline'] - r['task_metric_value'], + 'ford': lambda r:r['task_metric_value_baseline'] - r['task_metric_value'], +} + +imp_text = {True:'improvement', False:'no improvement'} +def improvement(r): + return imp_text[r['agent-improvement']>0] + +df.loc[:,'task_metric_value'] = df.apply(lambda r: r[metric_map[r['task_metric']]],axis=1) +df.loc[:,'task_metric_value_baseline'] = df.apply(lambda r: r['baseline_'+metric_map[r['task_metric']]],axis=1) +df.loc[:,'agent-baseline'] = df.loc[:,'task_metric_value'] - df.loc[:,'task_metric_value_baseline'] +df.loc[:,'agent-improvement'] = df.apply(lambda r: metric_best_fx[r['task_metric']](r),axis=1) +df.loc[:,'agent-impact'] = df.apply(improvement,axis=1) + +def success(s): + return sum(s>.1) + +def total(s): + return len(s) + +def improvement(s): + return sum(s>0) + +df_improvement_stats = df.groupby(['model','task_dataset',])['agent-improvement'].agg(['mean',success,total,improvement]).reset_index() +df_improvement_stats_tall = df_improvement_stats.melt(id_vars=['model','task_dataset'], + value_vars=['total','success','improvement'],var_name='count_type',value_name='count') +sns.set_context(context='poster',font_scale = .5) +sns.catplot(df_improvement_stats_tall, col = 'model',x='task_dataset', 
y='count',hue='count_type',kind='bar').savefig('target10_success.png') + diff --git a/fairnessbench_analysis/target_selection.py b/fairnessbench_analysis/target_selection.py new file mode 100644 index 0000000..16234b5 --- /dev/null +++ b/fairnessbench_analysis/target_selection.py @@ -0,0 +1,54 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns + +res = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +task_data_metric = res['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([res, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider_code = wider_code[wider_cols] + +wider_adrecon = wider_code[wider_code['task_dataset']=='adrecon'].reset_index(drop=True) + +# % of times (per model/etc) that actually gets a final result +allmetrics = ['acc', 'precision', 'recall', 'di', 'statistical_parity_diff', 'equal_opp_diff', + 'error_rate_diff', 'error_rate_ratio', 'false_omission_rate_diff'] + +wider_adrecon['all_metric_vals'] = wider_adrecon[allmetrics].notna().all(axis=1) + +res = wider_adrecon.groupby('model')['all_metric_vals'].mean() * 100 +res.round(2) +wider_adrecon = wider_adrecon.dropna() + +# regular performance of final models +wider_adrecon = wider_adrecon.rename(columns={'statistical_parity_diff': 'spd', 'equal_opp_diff': 'eod', + 'error_rate_diff': 'erd', 'error_rate_ratio': 'err', + 'false_omission_rate_diff': 'ford', 'precision': 'p', + 'recall': 'r'}) + +allmet = ['acc', 'p', 'r', 'di', 'spd', 'eod', 'erd', 'err', 'ford'] +adrec_res = (wider_adrecon.groupby(['model','task-dem'])[allmet].mean()).reset_index() +adrec_task_dem= 
adrec_res['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +adrec_res=pd.concat([adrec_res,adrec_task_dem],axis=1) +adrec_res=adrec_res.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') +allmet = ['acc', 'p', 'r', 'di', 'spd', 'eod', 'erd', 'err', 'ford'] + +adrec_long = pd.melt(adrec_res, id_vars=['model', 'task-dem', 'resrch_prob', 'dem'], value_vars= allmet, var_name='task_metric', value_name='task_metric_value') +adrec_long['model-dem'] = adrec_long['model'] + '-' + adrec_long['dem'] +adrec_long = adrec_long.rename(columns={'task_metric': 'task_metrics'}) + +high_good = ['acc', 'p', 'r', 'di','err'] # +low_good = ['spd', 'eod', 'erd', 'ford'] + +metric_rename = {} + +for m in high_good: + metric_rename[m] = f"{m} ↑" + +for m in low_good: + metric_rename[m] = f"{m} ↓" + +adrec_long['task_metric'] = adrec_long['task_metrics'].map(metric_rename) +g = sns.catplot(data=adrec_long,kind='bar',x='task_metric',y='task_metric_value',hue='model',col='dem',height=4,aspect=1.5) +plt.savefig("adrec_allmetric.png",dpi=400,bbox_inches='tight') \ No newline at end of file From a6694288be49aad6593ba57d5b68eff178a8fd86 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 13 Aug 2025 14:22:43 +0000 Subject: [PATCH 2/9] Organized how analysis outout charts and where it takes CSV files from --- .gitignore | 3 ++- .../adult_di_code_llmeval.py | 8 +++++++- fairnessbench_analysis/adult_fairness.py | 9 +++++++-- fairnessbench_analysis/balancing_fairness.py | 8 +++++++- fairnessbench_analysis/code_log_llm_eval.py | 12 ++++++++--- .../correlation_flake8_code.py | 8 +++++++- .../{ => csv_files}/Deepseek_cv.csv | 0 .../{ => csv_files}/Gemma_cv.csv | 0 .../{ => csv_files}/Granite_cv.csv | 0 .../cv_scores_evalmodels.py | 9 ++++++--- fairnessbench_analysis/di_across_datasets.py | 8 +++++++- fairnessbench_analysis/explode_results.py | 20 +++++++++++++++---- .../performance_flake8_code.py | 9 ++++++++- .../target10_sucess_rate.py | 10 ++++++++-- 
fairnessbench_analysis/target_selection.py | 9 ++++++++- 15 files changed, 92 insertions(+), 21 deletions(-) rename fairnessbench_analysis/{ => csv_files}/Deepseek_cv.csv (100%) rename fairnessbench_analysis/{ => csv_files}/Gemma_cv.csv (100%) rename fairnessbench_analysis/{ => csv_files}/Granite_cv.csv (100%) diff --git a/.gitignore b/.gitignore index 22b1bca..8945c09 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,8 @@ clean.sh *.json .vscode/ fairnessBench/eval/test/ - +fairnessbench_analysis/*/*.png +fairnessbench_analysis/*/*.csv # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/fairnessbench_analysis/adult_di_code_llmeval.py b/fairnessbench_analysis/adult_di_code_llmeval.py index 96ffbce..f28d0ba 100644 --- a/fairnessbench_analysis/adult_di_code_llmeval.py +++ b/fairnessbench_analysis/adult_di_code_llmeval.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import datetime @@ -5,7 +6,10 @@ import seaborn as sns # Loading useful dataframes -code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') +os.chdir('csv_files') +code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T08:50:51.905807.csv') +os.chdir('..') + # Removing missing rows fairnessBench code_eval= code_eval.dropna(how="any") @@ -50,4 +54,6 @@ for ax in m.axes.flatten(): plt.setp(ax.get_xticklabels(), rotation=30) ax.axhline(y=4.0, color='black', linestyle='-.', alpha=0.3) + +os.chdir('graphs/') plt.savefig('adult_di_code_llm_eval.png', dpi=400 , bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/adult_fairness.py b/fairnessbench_analysis/adult_fairness.py index bef5c41..e9d43db 100644 --- a/fairnessbench_analysis/adult_fairness.py +++ b/fairnessbench_analysis/adult_fairness.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import datetime @@ -5,7 +6,10 @@ import seaborn as sns # Loading useful dataframes -perf_alt 
= pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +os.chdir('csv_files') +perf_alt = pd.read_csv('Final_step_perfomance2025-08-13T08:50:41.399910.csv') +os.chdir('..') + # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', @@ -54,5 +58,6 @@ ax[3,i].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) ax[4,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) ax[5,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) - + +os.chdir('graphs/') plt.savefig('adult_fairness.png',dpi=300, bbox_inches='tight') diff --git a/fairnessbench_analysis/balancing_fairness.py b/fairnessbench_analysis/balancing_fairness.py index d77f134..82923ce 100644 --- a/fairnessbench_analysis/balancing_fairness.py +++ b/fairnessbench_analysis/balancing_fairness.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import datetime @@ -5,7 +6,10 @@ import seaborn as sns # Loading useful dataframes -perf_alt = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +os.chdir('csv_files') +perf_alt = pd.read_csv('Final_step_perfomance2025-08-13T08:50:41.399910.csv') +os.chdir('..') + # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', @@ -54,4 +58,6 @@ elif i in [1, 2, 4, 5]: # other fairness metrics ax[i,j].axhline(y=0.0, color='black', linestyle='-.', alpha=0.3) ax[i,j].axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) + +os.chdir('graphs/') plt.savefig('balancing_fairness.png',dpi=400,bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/code_log_llm_eval.py b/fairnessbench_analysis/code_log_llm_eval.py index 64618f6..05fd588 100644 --- a/fairnessbench_analysis/code_log_llm_eval.py +++ 
b/fairnessbench_analysis/code_log_llm_eval.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import datetime @@ -5,9 +6,11 @@ import seaborn as sns # Loading useful dataframes -code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') -log_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Results_Final_log_clean2025-08-06T04:22:17.377479.csv') -perf_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +os.chdir('csv_files') +code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T10:44:12.639136.csv') +log_eval = pd.read_csv('Results_Final_log_clean2025-08-13T10:44:21.146989.csv') +perf_df= pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') +os.chdir('..') # Removing missing rows code_eval= code_eval.dropna(how="any") @@ -42,5 +45,8 @@ score_cols = ["1. Model Overview", "2. Stakeholder Identification and Fairness Definition","3. Data Collection and Processing","4. Bias Detection and Mitigation","5. Fairness Metric Selection","6. Model Selection and Training","7. 
Evaluation and Testing"] log_tall = wider_log.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'], value_vars=score_cols,var_name='score') + + +os.chdir('graphs/') sns.catplot(log_tall,col='model',x='resrch_prob',row= 'task_dataset',y='value',hue='score',kind='bar').savefig('logval') diff --git a/fairnessbench_analysis/correlation_flake8_code.py b/fairnessbench_analysis/correlation_flake8_code.py index c8c15fa..683f0f9 100644 --- a/fairnessbench_analysis/correlation_flake8_code.py +++ b/fairnessbench_analysis/correlation_flake8_code.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import datetime @@ -5,7 +6,10 @@ import seaborn as sns # Loading useful dataframes -code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') +os.chdir('csv_files') +code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T10:44:12.639136.csv') +os.chdir('..') + # Removing missing rows code_eval= code_eval.dropna(how="any") @@ -48,4 +52,6 @@ def flake8_corr_matrix(group): yticklabels=['Flake8 score'] ) plt.title("Flake8 vs Rubric Correlation (claude-3, adult, balance, erd)") + +os.chdir('graphs/') plt.savefig('flake8_vs_code_correlation.png',bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/Deepseek_cv.csv b/fairnessbench_analysis/csv_files/Deepseek_cv.csv similarity index 100% rename from fairnessbench_analysis/Deepseek_cv.csv rename to fairnessbench_analysis/csv_files/Deepseek_cv.csv diff --git a/fairnessbench_analysis/Gemma_cv.csv b/fairnessbench_analysis/csv_files/Gemma_cv.csv similarity index 100% rename from fairnessbench_analysis/Gemma_cv.csv rename to fairnessbench_analysis/csv_files/Gemma_cv.csv diff --git a/fairnessbench_analysis/Granite_cv.csv b/fairnessbench_analysis/csv_files/Granite_cv.csv similarity index 100% rename from fairnessbench_analysis/Granite_cv.csv rename to 
fairnessbench_analysis/csv_files/Granite_cv.csv diff --git a/fairnessbench_analysis/cv_scores_evalmodels.py b/fairnessbench_analysis/cv_scores_evalmodels.py index 7d2979b..f759e69 100644 --- a/fairnessbench_analysis/cv_scores_evalmodels.py +++ b/fairnessbench_analysis/cv_scores_evalmodels.py @@ -1,8 +1,10 @@ +import os import pandas as pd -gemma_df = pd.read_csv('../fairnessBench/fairnessbench_analysis/Gemma_cv.csv') -deepseek_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Deepseek_cv.csv') -granite_df=pd.read_csv('../fairnessBench/fairnessbench_analysis/Granite_cv.csv') +os.chdir('csv_files/') +gemma_df = pd.read_csv('Gemma_cv.csv') +deepseek_df= pd.read_csv('Deepseek_cv.csv') +granite_df=pd.read_csv('Granite_cv.csv') gemma_df['eval'] = 'gemma' deepseek_df['eval'] = 'deepseek' @@ -14,4 +16,5 @@ granite_df = granite_df[cols] all_eval_cv = pd.concat([gemma_df, deepseek_df, granite_df], axis=0, ignore_index=True) + all_eval_cv.to_csv('cv_scores_evalmodel.csv',index=False) \ No newline at end of file diff --git a/fairnessbench_analysis/di_across_datasets.py b/fairnessbench_analysis/di_across_datasets.py index 5226b39..4b1be7f 100644 --- a/fairnessbench_analysis/di_across_datasets.py +++ b/fairnessbench_analysis/di_across_datasets.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import datetime @@ -5,7 +6,10 @@ import seaborn as sns # Loading useful dataframes -perf_alt = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +os.chdir('csv_files') +perf_alt = pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') +os.chdir('..') + # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', @@ -37,5 +41,7 @@ for i, ax in enumerate(g.axes.flat): ax.axhline(y=1.0, color='black', linestyle='-.', alpha=0.2) ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) + # saving 
the plot +os.chdir('graphs/') plt.savefig('di_vs_acc_scatter.png', dpi=300, bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/explode_results.py b/fairnessbench_analysis/explode_results.py index ee328d5..37a8afa 100644 --- a/fairnessbench_analysis/explode_results.py +++ b/fairnessbench_analysis/explode_results.py @@ -5,7 +5,7 @@ import json # loading the performance results -perf_path = '../results_manually_combined' +perf_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/results_final_total' result_files = [ os.path.join(perf_path, fname) for fname in os.listdir(perf_path) @@ -43,11 +43,14 @@ cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exploded_score.columns if col not in ['model', 'task', 'run_ts', 'run_id']] exploded_score = exploded_score[cols] exploded_score = exploded_score.drop(exploded_score.columns[4],axis=1) + +os.chdir('csv_files') exploded_score.to_csv('Final_step_perfomance' + datetime.isoformat(datetime.now()) +'.csv',index=False) +os.chdir('..') # loading llm eval results -result_path = '../results_final_total' +result_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/results_final_total' result_files = [ os.path.join(result_path, fname) for fname in os.listdir(result_path) @@ -98,7 +101,10 @@ on=['model', 'task', 'run_ts'], how='left' ) + +os.chdir('csv_files') exp_code.to_csv('Result_Final_code_clean' + datetime.isoformat(datetime.now()) +'.csv',index=False) +os.chdir('..') # extracting log llm eval results raw_log= result_df[["final_log_score"]].explode('final_log_score',)['final_log_score'].apply(pd.Series).reset_index().drop(columns = [0]) @@ -115,10 +121,13 @@ cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exp_log.columns if col not in ['model', 'task', 'run_ts', 'run_id']] exp_log = exp_log[cols] exp_log = exp_log.drop(exp_log.columns[4],axis=1) -#exp_log.to_csv('Results_Final_log_clean' + datetime.isoformat(datetime.now()) +'.csv',index=False) + 
+os.chdir('csv_files') +exp_log.to_csv('Results_Final_log_clean' + datetime.isoformat(datetime.now()) +'.csv',index=False) +os.chdir('..') # loading baseline results -result_path = '../sanity_results' +result_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/sanity_results' result_files = [ os.path.join(result_path, resjson) for resjson in os.listdir(result_path) @@ -151,4 +160,7 @@ exploded_score = exploded_score.rename( columns={col: f'baseline_{col}' for col in cols_to_prefix} ) + +os.chdir('csv_files') exploded_score.to_csv('Baseline_cleaned_perfomance' + datetime.isoformat(datetime.now()) +'.csv',index=False) +os.chdir('..') diff --git a/fairnessbench_analysis/performance_flake8_code.py b/fairnessbench_analysis/performance_flake8_code.py index c142544..31b40ff 100644 --- a/fairnessbench_analysis/performance_flake8_code.py +++ b/fairnessbench_analysis/performance_flake8_code.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np from datetime import datetime @@ -5,7 +6,11 @@ import seaborn as sns # Loading useful dataframes -code_eval = pd.read_csv('../fairnessBench/fairnessbench_analysis/Result_Final_code_clean2025-08-06T04:22:08.635847.csv') +os.chdir('csv_files') +code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T10:44:12.639136.csv') +os.chdir('..') + + # Removing missing rows code_eval= code_eval.dropna(how="any") @@ -33,6 +38,8 @@ for i, ax in enumerate(g.axes.flat): ax.axhline(y=85.0, color='black', linestyle='-.', alpha=0.2) ax.axvline(x=75.0, color='black', linestyle='-.', alpha=0.2) + +os.chdir('graphs/') plt.savefig('performance_flake8_code.png',dpi=300) diff --git a/fairnessbench_analysis/target10_sucess_rate.py b/fairnessbench_analysis/target10_sucess_rate.py index c0117a0..449d0ea 100644 --- a/fairnessbench_analysis/target10_sucess_rate.py +++ b/fairnessbench_analysis/target10_sucess_rate.py @@ -1,9 +1,12 @@ +import os import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns 
-perf_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') +os.chdir('csv_files') +perf_df= pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') +os.chdir('..') # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', @@ -13,7 +16,9 @@ perf_df = perf_df.replace([np.inf, -np.inf], np.nan).fillna(0) # loading baseline -baseline_df= pd.read_csv('../fairnessBench/fairnessbench_analysis/Baseline_cleaned_perfomance2025-08-06T06:09:53.582383.csv') +os.chdir('csv_files') +baseline_df= pd.read_csv('Baseline_cleaned_perfomance2025-08-13T10:44:21.444178.csv') +os.chdir('..') base= ['baseline_acc','baseline_precision', 'baseline_recall', 'baseline_di', 'baseline_statistical_parity_diff', 'baseline_equal_opp_diff', 'baseline_error_rate_diff', 'baseline_error_rate_ratio', @@ -97,5 +102,6 @@ def improvement(s): df_improvement_stats_tall = df_improvement_stats.melt(id_vars=['model','task_dataset'], value_vars=['total','success','improvement'],var_name='count_type',value_name='count') sns.set_context(context='poster',font_scale = .5) +os.chdir('graphs/') sns.catplot(df_improvement_stats_tall, col = 'model',x='task_dataset', y='count',hue='count_type',kind='bar').savefig('target10_success.png') diff --git a/fairnessbench_analysis/target_selection.py b/fairnessbench_analysis/target_selection.py index 16234b5..2430dcc 100644 --- a/fairnessbench_analysis/target_selection.py +++ b/fairnessbench_analysis/target_selection.py @@ -1,9 +1,14 @@ +import os import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns -res = pd.read_csv('../fairnessBench/fairnessbench_analysis/Final_step_perfomance2025-08-06T04:21:57.255454.csv') + +os.chdir('csv_files') +res = pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') +os.chdir('..') + task_data_metric = res['task'].str.split('_').apply(pd.Series).rename(columns 
={0:'task_dataset',1:'task_metric',2:'task-dem'}) wider_code = pd.concat([res, task_data_metric],axis=1) wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] @@ -51,4 +56,6 @@ adrec_long['task_metric'] = adrec_long['task_metrics'].map(metric_rename) g = sns.catplot(data=adrec_long,kind='bar',x='task_metric',y='task_metric_value',hue='model',col='dem',height=4,aspect=1.5) + +os.chdir('graphs/') plt.savefig("adrec_allmetric.png",dpi=400,bbox_inches='tight') \ No newline at end of file From e352ce6767b902fe4b444dcbb857828648c38696 Mon Sep 17 00:00:00 2001 From: Ritta Neg mfa Date: Thu, 18 Sep 2025 03:35:05 +0000 Subject: [PATCH 3/9] New changes to analysis files --- .gitignore | 3 +- .../adult_di_code_llmeval.py | 12 ++++---- fairnessbench_analysis/adult_fairness.py | 11 +++---- fairnessbench_analysis/balancing_fairness.py | 10 +++---- fairnessbench_analysis/code_log_llm_eval.py | 18 ++++++----- .../correlation_flake8_code.py | 10 +++---- .../cv_scores_evalmodels.py | 10 +++---- fairnessbench_analysis/di_across_datasets.py | 11 +++---- fairnessbench_analysis/explode_results.py | 30 +++++++------------ .../performance_flake8_code.py | 10 +++---- .../target10_sucess_rate.py | 15 +++++----- fairnessbench_analysis/target_selection.py | 12 ++++---- 12 files changed, 74 insertions(+), 78 deletions(-) diff --git a/.gitignore b/.gitignore index 06d144c..2d47b79 100644 --- a/.gitignore +++ b/.gitignore @@ -16,7 +16,8 @@ eval*.sh fairnessbench_analysis/*/*.png fairnessbench_analysis/*/*.csv - +# path +path.py # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/fairnessbench_analysis/adult_di_code_llmeval.py b/fairnessbench_analysis/adult_di_code_llmeval.py index f28d0ba..69714c3 100644 --- a/fairnessbench_analysis/adult_di_code_llmeval.py +++ 
b/fairnessbench_analysis/adult_di_code_llmeval.py @@ -4,11 +4,11 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns - +from path import CSV_FILES,GRAPHS # Loading useful dataframes -os.chdir('csv_files') -code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T08:50:51.905807.csv') -os.chdir('..') +file = CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv' +code_eval = pd.read_csv(file) + # Removing missing rows fairnessBench code_eval= code_eval.dropna(how="any") @@ -55,5 +55,5 @@ plt.setp(ax.get_xticklabels(), rotation=30) ax.axhline(y=4.0, color='black', linestyle='-.', alpha=0.3) -os.chdir('graphs/') -plt.savefig('adult_di_code_llm_eval.png', dpi=400 , bbox_inches='tight') \ No newline at end of file +output = os.path.join(GRAPHS, 'adult_di_code_llm_eval.png') +plt.savefig(output, dpi=400 , bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/adult_fairness.py b/fairnessbench_analysis/adult_fairness.py index e9d43db..c104bd1 100644 --- a/fairnessbench_analysis/adult_fairness.py +++ b/fairnessbench_analysis/adult_fairness.py @@ -4,11 +4,12 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS # Loading useful dataframes -os.chdir('csv_files') -perf_alt = pd.read_csv('Final_step_perfomance2025-08-13T08:50:41.399910.csv') -os.chdir('..') +file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv' +perf_alt = pd.read_csv(file) + # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', @@ -59,5 +60,5 @@ ax[4,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) ax[5,i].axhline(y=0.0, color='black', linestyle='-.', alpha=0.6) -os.chdir('graphs/') -plt.savefig('adult_fairness.png',dpi=300, bbox_inches='tight') +output = os.path.join(GRAPHS, 'adult_fairness.png') +plt.savefig(output,dpi=300, bbox_inches='tight') diff --git 
a/fairnessbench_analysis/balancing_fairness.py b/fairnessbench_analysis/balancing_fairness.py index 82923ce..b0bcd3a 100644 --- a/fairnessbench_analysis/balancing_fairness.py +++ b/fairnessbench_analysis/balancing_fairness.py @@ -4,11 +4,11 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS # Loading useful dataframes -os.chdir('csv_files') -perf_alt = pd.read_csv('Final_step_perfomance2025-08-13T08:50:41.399910.csv') -os.chdir('..') +file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv' +perf_alt = pd.read_csv(file) # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', @@ -59,5 +59,5 @@ ax[i,j].axhline(y=0.0, color='black', linestyle='-.', alpha=0.3) ax[i,j].axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) -os.chdir('graphs/') -plt.savefig('balancing_fairness.png',dpi=400,bbox_inches='tight') \ No newline at end of file +output = os.path.join(GRAPHS,'balancing_fairness.png') +plt.savefig(output,dpi=400,bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/code_log_llm_eval.py b/fairnessbench_analysis/code_log_llm_eval.py index 05fd588..906635f 100644 --- a/fairnessbench_analysis/code_log_llm_eval.py +++ b/fairnessbench_analysis/code_log_llm_eval.py @@ -4,13 +4,14 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS + # Loading useful dataframes -os.chdir('csv_files') -code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T10:44:12.639136.csv') -log_eval = pd.read_csv('Results_Final_log_clean2025-08-13T10:44:21.146989.csv') -perf_df= pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') -os.chdir('..') +code_eval = pd.read_csv(CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv') +log_eval = pd.read_csv(CSV_FILES/'Results_Final_log_clean2025-09-18T00:48:52.486398.csv') +perf_df= 
pd.read_csv(CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv') + # Removing missing rows code_eval= code_eval.dropna(how="any") @@ -31,7 +32,8 @@ score_cols = ["1. Data Collection and Processing","2. Bias Detection and Mitigation","3. Fairness Metric Selection","4. Model Selection and Training","5. Evaluation and Testing"] code_tall = wider_code.melt(id_vars=['model','task','task_dataset','task_metric','resrch_prob','dem','run_ts','run_id'], value_vars=score_cols,var_name='score') -sns.catplot(code_tall,col='model',row='resrch_prob',x= 'task_dataset',y='value',hue='score',kind='bar').savefig('codeval') +output= os.path.join(GRAPHS,'codeval') +sns.catplot(code_tall,col='model',row='resrch_prob',x= 'task_dataset',y='value',hue='score',kind='bar').savefig(output) # log eval @@ -47,6 +49,6 @@ value_vars=score_cols,var_name='score') -os.chdir('graphs/') -sns.catplot(log_tall,col='model',x='resrch_prob',row= 'task_dataset',y='value',hue='score',kind='bar').savefig('logval') +output=os.path.join(GRAPHS,'logval') +sns.catplot(log_tall,col='model',x='resrch_prob',row= 'task_dataset',y='value',hue='score',kind='bar').savefig(output) diff --git a/fairnessbench_analysis/correlation_flake8_code.py b/fairnessbench_analysis/correlation_flake8_code.py index 683f0f9..2b4c237 100644 --- a/fairnessbench_analysis/correlation_flake8_code.py +++ b/fairnessbench_analysis/correlation_flake8_code.py @@ -4,11 +4,11 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS # Loading useful dataframes -os.chdir('csv_files') -code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T10:44:12.639136.csv') -os.chdir('..') +file= CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv' +code_eval = pd.read_csv(file) # Removing missing rows code_eval= code_eval.dropna(how="any") @@ -53,5 +53,5 @@ def flake8_corr_matrix(group): ) plt.title("Flake8 vs Rubric Correlation (claude-3, adult, balance, erd)") 
-os.chdir('graphs/') -plt.savefig('flake8_vs_code_correlation.png',bbox_inches='tight') \ No newline at end of file +output= os.path.join(GRAPHS,'flake8_vs_code_correlation.png') +plt.savefig(output,bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/cv_scores_evalmodels.py b/fairnessbench_analysis/cv_scores_evalmodels.py index f759e69..e7d9cd7 100644 --- a/fairnessbench_analysis/cv_scores_evalmodels.py +++ b/fairnessbench_analysis/cv_scores_evalmodels.py @@ -1,10 +1,10 @@ import os import pandas as pd - -os.chdir('csv_files/') -gemma_df = pd.read_csv('Gemma_cv.csv') -deepseek_df= pd.read_csv('Deepseek_cv.csv') -granite_df=pd.read_csv('Granite_cv.csv') +from path import FILES + +gemma_df = pd.read_csv(FILES/'Gemma_cv.csv') +deepseek_df= pd.read_csv(FILES/'Deepseek_cv.csv') +granite_df=pd.read_csv(FILES/'Granite_cv.csv') gemma_df['eval'] = 'gemma' deepseek_df['eval'] = 'deepseek' diff --git a/fairnessbench_analysis/di_across_datasets.py b/fairnessbench_analysis/di_across_datasets.py index 4b1be7f..adcaeab 100644 --- a/fairnessbench_analysis/di_across_datasets.py +++ b/fairnessbench_analysis/di_across_datasets.py @@ -4,11 +4,11 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS # Loading useful dataframes -os.chdir('csv_files') -perf_alt = pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') -os.chdir('..') +file= CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv' +perf_alt = pd.read_csv(file) # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', @@ -43,5 +43,6 @@ ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) # saving the plot -os.chdir('graphs/') -plt.savefig('di_vs_acc_scatter.png', dpi=300, bbox_inches='tight') \ No newline at end of file +output= os.path.join(GRAPHS,'di_vs_acc_scatter.png') + +plt.savefig(output, dpi=300, bbox_inches='tight') diff --git 
a/fairnessbench_analysis/explode_results.py b/fairnessbench_analysis/explode_results.py index 37a8afa..a8a9285 100644 --- a/fairnessbench_analysis/explode_results.py +++ b/fairnessbench_analysis/explode_results.py @@ -3,9 +3,10 @@ import numpy as np from datetime import datetime import json +from path import PROJECT_ROOT,CSV_FILES # loading the performance results -perf_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/results_final_total' +perf_path = PROJECT_ROOT result_files = [ os.path.join(perf_path, fname) for fname in os.listdir(perf_path) @@ -44,13 +45,10 @@ exploded_score = exploded_score[cols] exploded_score = exploded_score.drop(exploded_score.columns[4],axis=1) -os.chdir('csv_files') -exploded_score.to_csv('Final_step_perfomance' + datetime.isoformat(datetime.now()) +'.csv',index=False) -os.chdir('..') - - +output_file=os.path.join(CSV_FILES, 'Final_step_perfomance' + datetime.isoformat(datetime.now()) +'.csv') +exploded_score.to_csv(output_file,index=False) # loading llm eval results -result_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/results_final_total' +result_path = PROJECT_ROOT result_files = [ os.path.join(result_path, fname) for fname in os.listdir(result_path) @@ -101,11 +99,8 @@ on=['model', 'task', 'run_ts'], how='left' ) - -os.chdir('csv_files') -exp_code.to_csv('Result_Final_code_clean' + datetime.isoformat(datetime.now()) +'.csv',index=False) -os.chdir('..') - +output_file=os.path.join(CSV_FILES, 'Result_Final_code_clean' + datetime.isoformat(datetime.now()) +'.csv') +exp_code.to_csv(output_file,index=False) # extracting log llm eval results raw_log= result_df[["final_log_score"]].explode('final_log_score',)['final_log_score'].apply(pd.Series).reset_index().drop(columns = [0]) exp_log= raw_log["raw_scores"].apply(pd.Series).drop(columns = [0]) @@ -122,10 +117,8 @@ exp_log = exp_log[cols] exp_log = exp_log.drop(exp_log.columns[4],axis=1) -os.chdir('csv_files') -exp_log.to_csv('Results_Final_log_clean' + 
datetime.isoformat(datetime.now()) +'.csv',index=False) -os.chdir('..') - +output_file=os.path.join(CSV_FILES, 'Results_Final_log_clean' + datetime.isoformat(datetime.now()) +'.csv') +exp_log.to_csv(output_file,index=False) # loading baseline results result_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/sanity_results' result_files = [ @@ -161,6 +154,5 @@ columns={col: f'baseline_{col}' for col in cols_to_prefix} ) -os.chdir('csv_files') -exploded_score.to_csv('Baseline_cleaned_perfomance' + datetime.isoformat(datetime.now()) +'.csv',index=False) -os.chdir('..') +output_file=os.path.join(CSV_FILES, 'Baseline_cleaned_perfomance' + datetime.isoformat(datetime.now()) +'.csv') +exploded_score.to_csv(output_file,index=False) diff --git a/fairnessbench_analysis/performance_flake8_code.py b/fairnessbench_analysis/performance_flake8_code.py index 31b40ff..ec823c3 100644 --- a/fairnessbench_analysis/performance_flake8_code.py +++ b/fairnessbench_analysis/performance_flake8_code.py @@ -4,11 +4,11 @@ from datetime import datetime import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS # Loading useful dataframes -os.chdir('csv_files') -code_eval = pd.read_csv('Result_Final_code_clean2025-08-13T10:44:12.639136.csv') -os.chdir('..') +file = CSV_FILES/'Result_Final_code_clean2025-09-18T00:48:40.584077.csv' +code_eval = pd.read_csv(file) # Removing missing rows @@ -39,7 +39,7 @@ ax.axhline(y=85.0, color='black', linestyle='-.', alpha=0.2) ax.axvline(x=75.0, color='black', linestyle='-.', alpha=0.2) -os.chdir('graphs/') -plt.savefig('performance_flake8_code.png',dpi=300) +output = os.path.join(GRAPHS,'performance_flake8_code.png') +plt.savefig(output,dpi=300) diff --git a/fairnessbench_analysis/target10_sucess_rate.py b/fairnessbench_analysis/target10_sucess_rate.py index 449d0ea..6bd9347 100644 --- a/fairnessbench_analysis/target10_sucess_rate.py +++ b/fairnessbench_analysis/target10_sucess_rate.py @@ -3,10 +3,10 @@ import numpy 
as np import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS -os.chdir('csv_files') -perf_df= pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') -os.chdir('..') +# Loading useful dataframes +perf_df = pd.read_csv(CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv') # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', @@ -16,9 +16,8 @@ perf_df = perf_df.replace([np.inf, -np.inf], np.nan).fillna(0) # loading baseline -os.chdir('csv_files') -baseline_df= pd.read_csv('Baseline_cleaned_perfomance2025-08-13T10:44:21.444178.csv') -os.chdir('..') + +baseline_df = pd.read_csv(CSV_FILES/'Baseline_cleaned_perfomance2025-09-18T00:48:53.537033.csv') base= ['baseline_acc','baseline_precision', 'baseline_recall', 'baseline_di', 'baseline_statistical_parity_diff', 'baseline_equal_opp_diff', 'baseline_error_rate_diff', 'baseline_error_rate_ratio', @@ -102,6 +101,6 @@ def improvement(s): df_improvement_stats_tall = df_improvement_stats.melt(id_vars=['model','task_dataset'], value_vars=['total','success','improvement'],var_name='count_type',value_name='count') sns.set_context(context='poster',font_scale = .5) -os.chdir('graphs/') -sns.catplot(df_improvement_stats_tall, col = 'model',x='task_dataset', y='count',hue='count_type',kind='bar').savefig('target10_success.png') +output= os.path.join(GRAPHS,'target10_success.png' ) +sns.catplot(df_improvement_stats_tall, col = 'model',x='task_dataset', y='count',hue='count_type',kind='bar').savefig(output) diff --git a/fairnessbench_analysis/target_selection.py b/fairnessbench_analysis/target_selection.py index 2430dcc..45e7d15 100644 --- a/fairnessbench_analysis/target_selection.py +++ b/fairnessbench_analysis/target_selection.py @@ -3,11 +3,11 @@ import numpy as np import matplotlib.pyplot as plt import seaborn as sns +from path import CSV_FILES,GRAPHS - -os.chdir('csv_files') -res = 
pd.read_csv('Final_step_perfomance2025-08-13T10:44:02.216469.csv') -os.chdir('..') +# Loading useful dataframes +file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv' +res = pd.read_csv(file) task_data_metric = res['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) wider_code = pd.concat([res, task_data_metric],axis=1) @@ -57,5 +57,5 @@ adrec_long['task_metric'] = adrec_long['task_metrics'].map(metric_rename) g = sns.catplot(data=adrec_long,kind='bar',x='task_metric',y='task_metric_value',hue='model',col='dem',height=4,aspect=1.5) -os.chdir('graphs/') -plt.savefig("adrec_allmetric.png",dpi=400,bbox_inches='tight') \ No newline at end of file +output= os.path.join(GRAPHS,"adrec_allmetric.png") +plt.savefig(output,dpi=400,bbox_inches='tight') \ No newline at end of file From 3d8e3f4a00dafdc30fc776991e18409ac846074b Mon Sep 17 00:00:00 2001 From: Ritta Neg mfa Date: Thu, 18 Sep 2025 03:51:04 +0000 Subject: [PATCH 4/9] readme file and removed results from the repo --- .../csv_files/Deepseek_cv.csv | 3 -- fairnessbench_analysis/csv_files/Gemma_cv.csv | 8 ---- .../csv_files/Granite_cv.csv | 3 -- fairnessbench_analysis/readme.md | 48 +++++++++++++++++++ 4 files changed, 48 insertions(+), 14 deletions(-) delete mode 100644 fairnessbench_analysis/csv_files/Deepseek_cv.csv delete mode 100644 fairnessbench_analysis/csv_files/Gemma_cv.csv delete mode 100644 fairnessbench_analysis/csv_files/Granite_cv.csv create mode 100644 fairnessbench_analysis/readme.md diff --git a/fairnessbench_analysis/csv_files/Deepseek_cv.csv b/fairnessbench_analysis/csv_files/Deepseek_cv.csv deleted file mode 100644 index 22fefe5..0000000 --- a/fairnessbench_analysis/csv_files/Deepseek_cv.csv +++ /dev/null @@ -1,3 +0,0 @@ -model,task,1. Data Collection and Processing,2. Bias Detection and Mitigation,3. Fairness Metric Selection,4. Model Selection and Training,5. 
Evaluation and Testing -claude-3-7-sonnet-20250219,adult_err_balance-race,1.1035347137172349,0.2793492449300636,0.1818753236896995,0.15465752264217267,0.23111601141098256 -qwen,adult_err_balance-race,0.0,0.29632197530102916,0.3853887243714261,0.4880266796833574,0.34656317045517515 diff --git a/fairnessbench_analysis/csv_files/Gemma_cv.csv b/fairnessbench_analysis/csv_files/Gemma_cv.csv deleted file mode 100644 index f9f85a4..0000000 --- a/fairnessbench_analysis/csv_files/Gemma_cv.csv +++ /dev/null @@ -1,8 +0,0 @@ -model,task,1. Data Collection and Processing,2. Bias Detection and Mitigation,3. Fairness Metric Selection,4. Model Selection and Training,5. Evaluation and Testing -claude-3-7-sonnet-20250219,adult_err_balance-race,0.15946509441945983,0.11649810905340587,0.0,0.15946509441945983,0.0 -claude-3-7-sonnet-20250219,adult_err_balance-sex,0.17494570236436235,0.0,0.0,0.1957400731715678,0.0 -gpt-4o,adult_err_balance-race,0.0,0.0,0.0,0.0,0.0 -gpt-4o,adult_err_balance-sex,0.0,0.0,0.0,0.0,0.0 -llama,adult_err_balance-race,0.20573779994945587,0.19716158838352976,0.0,0.12297509238026914,0.0 -llama,adult_err_balance-sex,0.3043212760213842,0.12426253043692712,0.0,0.16886551261045996,0.0 -qwen,adult_err_balance-sex,0.2138089935299395,0.19716158838352973,0.0,0.19716158838352973,0.0 diff --git a/fairnessbench_analysis/csv_files/Granite_cv.csv b/fairnessbench_analysis/csv_files/Granite_cv.csv deleted file mode 100644 index e0023d9..0000000 --- a/fairnessbench_analysis/csv_files/Granite_cv.csv +++ /dev/null @@ -1,3 +0,0 @@ -model,task,1. Data Collection and Processing,2. Bias Detection and Mitigation,3. Fairness Metric Selection,4. Model Selection and Training,5. 
Evaluation and Testing -claude-3-7-sonnet-20250219,adult_err_balance-race,0.04932502891543654,0.09584211726899525,0.14691056734678462,0.0,0.04844009143018392 -qwen,adult_err_balance-race,0.18672359914948844,1.282654434033444,0.2803402154503214,0.062112999374994156,0.28247912462432095 diff --git a/fairnessbench_analysis/readme.md b/fairnessbench_analysis/readme.md new file mode 100644 index 0000000..ebbca27 --- /dev/null +++ b/fairnessbench_analysis/readme.md @@ -0,0 +1,48 @@ +# fairnessbench analysis + +This folder contains all the code and data for analyzing the fairnessbench results. +The main analysis script is explode_results.py, which loads the raw results data and creates clean CSV files ready for analysis. + +# A. Setup: + +**Local path configuration** +1. Create `paths.py` at the repo root. +2. Create the csv_files and graphs directories, then set CSV_FILES and GRAPHS in paths.py to the absolute paths on your machine. + +### Required variables in `paths.py` +- **PROJECT_ROOT** — Directory that contains all *raw results*. +- **CSV_FILES** — Directory that contains the *clean CSV files* produced by `explode_results.py`. +- **GRAPHS** — Directory where analysis scripts will save generated *figures/plots*. +- **FILES** — Directory that stores *CSV files from different evaluation models* (used by `cv_scores_evalmodels.py`). + +# B. Run Analysis: +**Run main file** +```python +python explode_results.py +``` +This will create the following files in the csv_files/ directory: +- Result_Final_code_clean*.csv: File contains raw scores and final scores from the llm evaluation on the training scripts(code). +- Result_Final_log_clean*: File contains raw scores and final scores from the llm evaluation on the reasoning process of the agent(log). +- Final_step_performance*.csv: File contains performance metric(e.g. accuracy,disparate impact etc.) scores of the models on each task. 
+These files are then used for futher analysis +**Analysis** +In FairnessBench we run several analyses on our results. Each `.py` file performs a different analysis and generates plots that are stored in the `graphs/` directory. +To run an analysis, change the input CSV filename in the script to the file required for that analysis. +**Example:** To analyze different types of fairness for the Adult dataset, run `adult_fairness.py`. Before running it, update the script’s input CSV to the new file generated in the `csv_files/` directory. + +```python +python ....py +``` +***Key files:*** +`- adult_di_code_llmeval.py: Analyzes disparate impact (DI) for the Adult dataset using the LLM code evaluation.`\ +`- adult_fairness.py: Analyze the fairness metrics used in the benchmark for the Adult dataset.`\ +`- balancing_fairness.py: Analyzes the fairness metrics used in the benchmark for the Balance research problem.`\ +`- code_log_llm_eval.py: Generates plots showing how rubric-section scores vary across tasks and datasets for both code and log evaluations.`\ +`- correlation_flake8_code.py: Analyzes the correlation between LLM code-evaluation scores and Flake8 (linter-based) scores.`\ +`- cv_scores_evalmodels.py: Analyzes the consistency of different evaluation models based on cross-validation (CV) scores.`\ +`- di_across_datasets.py: Analyzes disparate impact (DI) and accuracy across datasets and research problems.`\ +`- performance_flake8_code.py: Generates a scatter plot of Flake8 performance vs. 
LLM code-evaluation scores across models, datasets, and research problems.`\ +`- target_selection.py: Analyze the models performance and fairness metrics for the targey selection tasks.`\ +`- target10_sucess_rate.py: Generates a plot showing how agent performance differs from the baseline on the Target10 research problem, by dataset and model.`\ + + From 16cdbe1902f27b0b6a917b9c63403d79f0d62fc0 Mon Sep 17 00:00:00 2001 From: Ritta Neg mfa Date: Thu, 18 Sep 2025 03:57:40 +0000 Subject: [PATCH 5/9] font changes to readme --- fairnessbench_analysis/readme.md | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/fairnessbench_analysis/readme.md b/fairnessbench_analysis/readme.md index ebbca27..2ca8829 100644 --- a/fairnessbench_analysis/readme.md +++ b/fairnessbench_analysis/readme.md @@ -8,7 +8,7 @@ The main analysis script is explode_results.py, which loads the raw results data **Local path configuration** 1. Create `paths.py` at the repo root. 2. Create the csv_files and graphs directories, then set CSV_FILES and GRAPHS in paths.py to the absolute paths on your machine. - +3. `paths.py` is in `.gitignore`. ### Required variables in `paths.py` - **PROJECT_ROOT** — Directory that contains all *raw results*. - **CSV_FILES** — Directory that contains the *clean CSV files* produced by `explode_results.py`. @@ -24,7 +24,9 @@ This will create the following files in the csv_files/ directory: - Result_Final_code_clean*.csv: File contains raw scores and final scores from the llm evaluation on the training scripts(code). - Result_Final_log_clean*: File contains raw scores and final scores from the llm evaluation on the reasoning process of the agent(log). - Final_step_performance*.csv: File contains performance metric(e.g. accuracy,disparate impact etc.) scores of the models on each task. -These files are then used for futher analysis + +These files are then used for futher analysis. 
+ **Analysis** In FairnessBench we run several analyses on our results. Each `.py` file performs a different analysis and generates plots that are stored in the `graphs/` directory. To run an analysis, change the input CSV filename in the script to the file required for that analysis. @@ -33,16 +35,16 @@ To run an analysis, change the input CSV filename in the script to the file requ ```python python ....py ``` -***Key files:*** -`- adult_di_code_llmeval.py: Analyzes disparate impact (DI) for the Adult dataset using the LLM code evaluation.`\ -`- adult_fairness.py: Analyze the fairness metrics used in the benchmark for the Adult dataset.`\ -`- balancing_fairness.py: Analyzes the fairness metrics used in the benchmark for the Balance research problem.`\ -`- code_log_llm_eval.py: Generates plots showing how rubric-section scores vary across tasks and datasets for both code and log evaluations.`\ -`- correlation_flake8_code.py: Analyzes the correlation between LLM code-evaluation scores and Flake8 (linter-based) scores.`\ -`- cv_scores_evalmodels.py: Analyzes the consistency of different evaluation models based on cross-validation (CV) scores.`\ -`- di_across_datasets.py: Analyzes disparate impact (DI) and accuracy across datasets and research problems.`\ -`- performance_flake8_code.py: Generates a scatter plot of Flake8 performance vs. LLM code-evaluation scores across models, datasets, and research problems.`\ -`- target_selection.py: Analyze the models performance and fairness metrics for the targey selection tasks.`\ -`- target10_sucess_rate.py: Generates a plot showing how agent performance differs from the baseline on the Target10 research problem, by dataset and model.`\ +**Key files:** +- adult_di_code_llmeval.py: Analyzes disparate impact (DI) for the Adult dataset using the LLM code evaluation. +- adult_fairness.py: Analyze the fairness metrics used in the benchmark for the Adult dataset. 
+- balancing_fairness.py: Analyzes the fairness metrics used in the benchmark for the Balance research problem. +- code_log_llm_eval.py: Generates plots showing how rubric-section scores vary across tasks and datasets for both code and log evaluations. +- correlation_flake8_code.py: Analyzes the correlation between LLM code-evaluation scores and Flake8 (linter-based) scores. +- cv_scores_evalmodels.py: Analyzes the consistency of different evaluation models based on cross-validation (CV) scores. +- di_across_datasets.py: Analyzes disparate impact (DI) and accuracy across datasets and research problems. +- performance_flake8_code.py: Generates a scatter plot of Flake8 performance vs. LLM code-evaluation scores across models, datasets, and research problems. +- target_selection.py: Analyze the models performance and fairness metrics for the targey selection tasks. +- target10_sucess_rate.py: Generates a plot showing how agent performance differs from the baseline on the Target10 research problem, by dataset and model. 
From 3760c844cdcfbf30caea27ca63722b8b6f9e757d Mon Sep 17 00:00:00 2001 From: Ritta Neg mfa Date: Fri, 20 Feb 2026 18:20:05 +0000 Subject: [PATCH 6/9] New analysis files --- fairnessbench_analysis/adult_fairness.py | 4 +- fairnessbench_analysis/balancing_fairness.py | 76 +++++++++++++- fairnessbench_analysis/di_across_datasets.py | 50 +++++++++- fairnessbench_analysis/explode_results.py | 99 +++++-------------- .../performance_flake8_code.py | 2 +- fairnessbench_analysis/readme.md | 14 +-- .../target10_sucess_rate.py | 16 +-- fairnessbench_analysis/target_selection.py | 4 +- 8 files changed, 167 insertions(+), 98 deletions(-) diff --git a/fairnessbench_analysis/adult_fairness.py b/fairnessbench_analysis/adult_fairness.py index c104bd1..7af2c7e 100644 --- a/fairnessbench_analysis/adult_fairness.py +++ b/fairnessbench_analysis/adult_fairness.py @@ -7,7 +7,7 @@ from path import CSV_FILES,GRAPHS # Loading useful dataframes -file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv' +file = CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' perf_alt = pd.read_csv(file) @@ -21,7 +21,7 @@ # expanding the task to sub columns task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) wider_code = pd.concat([perf_alt, task_data_metric],axis=1) -wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff'] wider_code = wider_code[wider_cols] # Filtering only adult dataset from the dataframe diff --git a/fairnessbench_analysis/balancing_fairness.py 
b/fairnessbench_analysis/balancing_fairness.py index b0bcd3a..c9dc503 100644 --- a/fairnessbench_analysis/balancing_fairness.py +++ b/fairnessbench_analysis/balancing_fairness.py @@ -7,7 +7,7 @@ from path import CSV_FILES,GRAPHS # Loading useful dataframes -file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv' +file = CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' perf_alt = pd.read_csv(file) # Removing missing rows @@ -20,12 +20,12 @@ # expanding the task to sub columns task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) wider_code = pd.concat([perf_alt, task_data_metric],axis=1) -wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff'] wider_code = wider_code[wider_cols] task_task_dem = wider_code['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'rsch_prob',1:'dem'}) wider = pd.concat([wider_code, task_task_dem],axis=1) -cols= ['model','task','task_dataset','task_metric','task-dem','rsch_prob','dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +cols= ['model','task','task_dataset','task_metric','task-dem','rsch_prob','dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff'] wider=wider[cols] # Filtering only balance task from the dataframe @@ -60,4 +60,74 @@ ax[i,j].axvline(x=1.0, color='black', 
linestyle='-.', alpha=0.2) output = os.path.join(GRAPHS,'balancing_fairness.png') +plt.savefig(output,dpi=400,bbox_inches='tight') + + +# Filtering only best task from the dataframe +wider_best = wider[wider['rsch_prob']=='best'] +wider_best=wider_best.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +wider_best= wider_best.copy() +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 'erd' : 'error_rate_diff', + 'ford': 'false_omission_rate_diff', +} +wider_best.loc[:, 'task_metric_value'] = wider_best.apply(lambda row: row[metric_map[row['task_metric']]], axis=1) + +sns.set_context(context='poster',font_scale=1.0) +g=sns.relplot(data=wider_best,x='acc',y='task_metric_value',hue='task_dataset',style='dem',row='task_metric',col='model',kind='scatter', + aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') + +# add horizontal lines at di=1 for the first row and a vertical line for acc=1 +ax=g.axes +for i in range(len(ax)): + for j in range(len(ax[0])): + if i in [0,3]: # first row (di) + ax[i,j].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) + elif i in [1, 2, 4, 5]: # other fairness metrics + ax[i,j].axhline(y=0.0, color='black', linestyle='-.', alpha=0.3) + ax[i,j].axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) + +output = os.path.join(GRAPHS,'balancing_fairness_best.png') +plt.savefig(output,dpi=400,bbox_inches='tight') + + +# Filtering only implicit task from the dataframe +wider_implicit = wider[wider['rsch_prob']=='implicit'] +wider_implicit=wider_implicit.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +wider_implicit= wider_implicit.copy() +metric_map = { + 'acc': 'acc', + 'di': 'di', + 'spd': 'statistical_parity_diff', + 'eod': 'equal_opp_diff', + 'err' : 'error_rate_ratio', + 'erd' : 'error_rate_diff', + 'ford': 'false_omission_rate_diff', +} +wider_implicit.loc[:, 'task_metric_value'] = 
wider_implicit.apply(lambda row: row[metric_map[row['task_metric']]], axis=1) + +sns.set_context(context='poster',font_scale=1.0) +g=sns.relplot(data=wider_implicit,x='acc',y='task_metric_value',hue='task_dataset',style='dem',row='task_metric',col='model',kind='scatter', + aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') + +# add horizontal lines at di=1 for the first row and a vertical line for acc=1 +ax=g.axes +for i in range(len(ax)): + for j in range(len(ax[0])): + if i in [0,3]: # first row (di) + ax[i,j].axhline(y=1.0, color='black', linestyle='-.', alpha=0.3) + elif i in [1, 2, 4, 5]: # other fairness metrics + ax[i,j].axhline(y=0.0, color='black', linestyle='-.', alpha=0.3) + ax[i,j].axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) + +output = os.path.join(GRAPHS,'balancing_fairness_implicit.png') plt.savefig(output,dpi=400,bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/di_across_datasets.py b/fairnessbench_analysis/di_across_datasets.py index adcaeab..c15b728 100644 --- a/fairnessbench_analysis/di_across_datasets.py +++ b/fairnessbench_analysis/di_across_datasets.py @@ -7,20 +7,20 @@ from path import CSV_FILES,GRAPHS # Loading useful dataframes -file= CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv' +file= CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' perf_alt = pd.read_csv(file) # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', 'false_omission_rate_diff'] -perf_alt= perf_alt.dropna(subset=perf, how='all') +#perf_alt= perf_alt.dropna(subset=perf, how='all') perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) # expanding the task to sub columns task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) wider_code = pd.concat([perf_alt, task_data_metric],axis=1) 
-wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff'] wider_code = wider_code[wider_cols] # Filtering only DI from the dataframe @@ -40,9 +40,49 @@ # adding horizontal lines at di=1 to each cell plot for i, ax in enumerate(g.axes.flat): ax.axhline(y=1.0, color='black', linestyle='-.', alpha=0.2) - ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) - + ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) # saving the plot output= os.path.join(GRAPHS,'di_vs_acc_scatter.png') +plt.savefig(output, dpi=300, bbox_inches='tight') + +# checking SPD vs Acc +wider_spd = wider_code[wider_code['task_metric']=='spd'] +wider_SPD = ( + wider_spd.groupby(['task_dataset','task-dem'])[['statistical_parity_diff','acc']].mean().reset_index() +) +dem_df= wider_spd['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_SPD=pd.concat([wider_spd,dem_df],axis=1) +wider_SPD=wider_SPD.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') +# ploting the scatter plot for spd vs acc +sns.set_context(context='poster',font_scale=0.8) +g=sns.relplot(data=wider_SPD,x='acc',y='statistical_parity_diff',hue='task_dataset',row='resrch_prob',style='dem',col='model',kind='scatter',aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +# adding horizontal lines at di=1 to each cell plot +for i, ax in enumerate(g.axes.flat): + ax.axhline(y=0.0, color='black', linestyle='-.', alpha=0.2) + ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) +# saving the plot +output= 
os.path.join(GRAPHS,'spd_vs_acc_scatter.png') plt.savefig(output, dpi=300, bbox_inches='tight') + + +# checking EOD vs Acc +wider_eod = wider_code[wider_code['task_metric']=='eod'] +wider_EOD = ( + wider_eod.groupby(['task_dataset','task-dem'])[['equal_opp_diff','acc']].mean().reset_index() +) +dem_df= wider_eod['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'resrch_prob',1:'dem'}) +wider_EOD=pd.concat([wider_eod,dem_df],axis=1) +wider_EOD=wider_EOD.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') +# ploting the scatter plot for eod vs acc +sns.set_context(context='poster',font_scale=0.8) +g=sns.relplot(data=wider_EOD,x='acc',y='equal_opp_diff',hue='task_dataset',row='resrch_prob',style='dem',col='model',kind='scatter',aspect=1) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') +# adding horizontal lines at di=1 to each cell plot +for i, ax in enumerate(g.axes.flat): + ax.axhline(y=0.0, color='black', linestyle='-.', alpha=0.2) + ax.axvline(x=1.0, color='black', linestyle='-.', alpha=0.2) +# saving the plot +output= os.path.join(GRAPHS,'eod_vs_acc_scatter.png') +plt.savefig(output, dpi=300, bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/explode_results.py b/fairnessbench_analysis/explode_results.py index a8a9285..35fdc81 100644 --- a/fairnessbench_analysis/explode_results.py +++ b/fairnessbench_analysis/explode_results.py @@ -23,17 +23,20 @@ except Exception as e: print(f"Skipping file {rf} due to error: {e}") performance_df = pd.concat(result_list) +# Drop unsuccessful runs - keep only dict type (successful runs) +result_df_successful = performance_df[performance_df['final_score'].apply(lambda x: isinstance(x, dict))] +print(f"Total rows after filtering: {len(result_df_successful)}") +print(f"Rows dropped: {len(performance_df) - len(result_df_successful)}") end_series = lambda s: pd.Series(s[-5:]) -model_run = performance_df['path'].str.split('/').apply(end_series).rename(columns = 
+model_run = result_df_successful['path'].str.split('/').apply(end_series).rename(columns = {i:c for i,c in enumerate(['model','task','run_ts'])}) model_run['run_id']= model_run.groupby(['model','task']).cumcount() mr_keep = ['model','task','run_ts','run_id'] -# extracting the performance scores for the results to save in a csv file -exploded_score = performance_df['final_score'].apply(pd.Series).reset_index().drop(columns=[0]) -exploded_score['score_count'] = exploded_score.groupby('index').cumcount() +# extracting the final and the steps performance scores for the results to save in a csv file +exploded_score = result_df_successful['final_score'].apply(pd.Series).reset_index() sp = exploded_score['index'].str.split('/').apply(end_series) sp = sp.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) @@ -45,82 +48,33 @@ exploded_score = exploded_score[cols] exploded_score = exploded_score.drop(exploded_score.columns[4],axis=1) -output_file=os.path.join(CSV_FILES, 'Final_step_perfomance' + datetime.isoformat(datetime.now()) +'.csv') -exploded_score.to_csv(output_file,index=False) -# loading llm eval results -result_path = PROJECT_ROOT -result_files = [ - os.path.join(result_path, fname) - for fname in os.listdir(result_path) - if os.path.isfile(os.path.join(result_path, fname)) -] -result_list = [] -for rf in result_files: - try: - if os.path.getsize(rf) == 0: - print(f"Skipping empty file: {rf}") - continue - df = pd.read_json(rf).T - result_list.append(df) - except Exception as e: - print(f"Skipping file {rf} due to error: {e}") -result_df = pd.concat(result_list) - -end_series = lambda s: pd.Series(s[-5:]) -model_run = result_df['path'].str.split('/').apply(end_series).rename(columns = - {i:c for i,c in enumerate(['model','task','run_ts'])}) - -model_run['run_id']= model_run.groupby(['model','task']).cumcount() -mr_keep = ['model','task','run_ts','run_id'] -# extracting llm code evaluation -raw_df= 
result_df[["final_llm_score"]].explode('final_llm_score',)['final_llm_score'].apply(pd.Series).reset_index().drop(columns=[0]) -exp_code= raw_df["raw_scores"].apply(pd.Series).drop(columns=[0]) -exp_code = raw_df.join(raw_df["raw_scores"].apply(pd.Series)).drop(columns= ['raw_scores', 'justifications', 'subtotals',0]) -splits = exp_code['index'].str.split('/').apply(end_series) -splits = splits.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) - -exp_code = exp_code.join(splits[['model', 'task', 'run_ts']]) - -exp_code['run_id'] = exp_code.groupby(['model', 'task']).cumcount() - -cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exp_code.columns if col not in ['model', 'task', 'run_ts', 'run_id']] -exp_code = exp_code[cols] -exp_code = exp_code.drop(exp_code.columns[4],axis=1) - -# adding flake8 results to the code llm eval df -flake8_df = result_df[['path', 'final_flake8_score']].copy() +# adding flake8 results to performnce df +flake8_df = result_df_successful[['path', 'final_flake8_score']].copy() sps = flake8_df['path'].str.split('/').apply(end_series) sps = sps.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) flake8_df = flake8_df.join(sps[['model', 'task', 'run_ts']]) -# merging both dfs -exp_code = exp_code.merge( +exploded_score = exploded_score.merge( flake8_df[['model', 'task', 'run_ts', 'final_flake8_score']], on=['model', 'task', 'run_ts'], how='left' ) -output_file=os.path.join(CSV_FILES, 'Result_Final_code_clean' + datetime.isoformat(datetime.now()) +'.csv') -exp_code.to_csv(output_file,index=False) -# extracting log llm eval results -raw_log= result_df[["final_log_score"]].explode('final_log_score',)['final_log_score'].apply(pd.Series).reset_index().drop(columns = [0]) -exp_log= raw_log["raw_scores"].apply(pd.Series).drop(columns = [0]) -exp_log = raw_log.join(raw_log["raw_scores"].apply(pd.Series)).drop(columns= ['raw_scores', 'justifications', 'subtotals',0]) -exp_log = 
exp_log.rename(columns={"total_llm_score":"total_log_score"}) -split = exp_log['index'].str.split('/').apply(end_series) -split = split.rename(columns={i: c for i, c in enumerate([ 'model', 'task', 'run_ts'])}) - -exp_log = exp_log.join(split[['model', 'task', 'run_ts']]) - -exp_log['run_id'] = exp_log.groupby(['model', 'task']).cumcount() - -cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exp_log.columns if col not in ['model', 'task', 'run_ts', 'run_id']] -exp_log = exp_log[cols] -exp_log = exp_log.drop(exp_log.columns[4],axis=1) - -output_file=os.path.join(CSV_FILES, 'Results_Final_log_clean' + datetime.isoformat(datetime.now()) +'.csv') -exp_log.to_csv(output_file,index=False) +# Function to get last 8 runs +def get_last_best8(df): + if len(df) > 8: + return df.sort_values('run_id').tail(8) + else: + return df + +# Keep only last 8 successful runs per task +exp_score_filtered = exploded_score.groupby(['task','model']).apply(get_last_best8).reset_index(drop=True) +print(f"\nFinal rows after keeping last 8 per task: {len(exp_score_filtered)}") + +output_file=os.path.join(CSV_FILES, 'Final_step_perfomance' + datetime.isoformat(datetime.now()) +'.csv') +exp_score_filtered.to_csv(output_file,index=False) + # loading baseline results -result_path = '/project/pi_brownsarahm_uri_edu/ayman_uri/fairnessBench/sanity_results' +result_path = '/scratch3/workspace/ayman_sandouk_uri_edu-fairness/fairnessBench/baseline_results' result_files = [ os.path.join(result_path, resjson) for resjson in os.listdir(result_path) @@ -136,8 +90,7 @@ model_run['run_id']= model_run.groupby(['task','run_ts']).cumcount() mr_keep = ['task','run_ts','run_id'] - -exploded_score = result_df[['score']].explode('score',)['score'].apply(pd.Series).reset_index().drop(columns = [0]) +exploded_score = result_df['final_score'].apply(pd.Series).reset_index() exploded_score['score_count'] = exploded_score.groupby('index').cumcount() sp = exploded_score['index'].str.split('/').apply(end_series) 
sp = sp.rename(columns={i: c for i, c in enumerate([ 'task', 'run_ts'])}) diff --git a/fairnessbench_analysis/performance_flake8_code.py b/fairnessbench_analysis/performance_flake8_code.py index ec823c3..b456621 100644 --- a/fairnessbench_analysis/performance_flake8_code.py +++ b/fairnessbench_analysis/performance_flake8_code.py @@ -37,7 +37,7 @@ # add horizontal lines at di=1 to each cell plot for i, ax in enumerate(g.axes.flat): ax.axhline(y=85.0, color='black', linestyle='-.', alpha=0.2) - ax.axvline(x=75.0, color='black', linestyle='-.', alpha=0.2) + ax.axvline(x=85.0, color='black', linestyle='-.', alpha=0.2) output = os.path.join(GRAPHS,'performance_flake8_code.png') plt.savefig(output,dpi=300) diff --git a/fairnessbench_analysis/readme.md b/fairnessbench_analysis/readme.md index 2ca8829..3ccb1d2 100644 --- a/fairnessbench_analysis/readme.md +++ b/fairnessbench_analysis/readme.md @@ -36,15 +36,17 @@ To run an analysis, change the input CSV filename in the script to the file requ python ....py ``` **Key files:** -- adult_di_code_llmeval.py: Analyzes disparate impact (DI) for the Adult dataset using the LLM code evaluation. - adult_fairness.py: Analyze the fairness metrics used in the benchmark for the Adult dataset. - balancing_fairness.py: Analyzes the fairness metrics used in the benchmark for the Balance research problem. -- code_log_llm_eval.py: Generates plots showing how rubric-section scores vary across tasks and datasets for both code and log evaluations. -- correlation_flake8_code.py: Analyzes the correlation between LLM code-evaluation scores and Flake8 (linter-based) scores. -- cv_scores_evalmodels.py: Analyzes the consistency of different evaluation models based on cross-validation (CV) scores. -- di_across_datasets.py: Analyzes disparate impact (DI) and accuracy across datasets and research problems. -- performance_flake8_code.py: Generates a scatter plot of Flake8 performance vs. 
LLM code-evaluation scores across models, datasets, and research problems. +- di_across_datasets.py: Analyzes fairness (disparate impact (DI) and equal opportunity diff (EOB))and accuracy across datasets and research problems. +- comparing_flake8_bal_be_impli.py: Generates a bar plot of Flake8 performance for 3 research problems ( balance, best, implicit) for different models and datasets. - target_selection.py: Analyze the models performance and fairness metrics for the targey selection tasks. - target10_sucess_rate.py: Generates a plot showing how agent performance differs from the baseline on the Target10 research problem, by dataset and model. +- acc_di_tradeoff_heatmap.py: Generates a heatmap that shows the tradeoff between accuracy and fairness (disparate impact) for 3 research problems ( balance, best, implicit) using the Pareto frontier. +- acc_fairness_overlap_heatmap.py: Analyses the variation in datasets (randoadult,sampadult, nondescriptive, health) to the adult dataset. +- run_count.py: Analyzes the number of completed and successful runs for each model and dataset. +- sensitivity_analysis.py: code analyzing the LLM agent's performance and fairness for different versions of the same prompt. +- dollarstreet_analysis.py: codebase for analyzing the performance across income levels. 
+ diff --git a/fairnessbench_analysis/target10_sucess_rate.py b/fairnessbench_analysis/target10_sucess_rate.py index 6bd9347..5c55146 100644 --- a/fairnessbench_analysis/target10_sucess_rate.py +++ b/fairnessbench_analysis/target10_sucess_rate.py @@ -6,23 +6,23 @@ from path import CSV_FILES,GRAPHS # Loading useful dataframes -perf_df = pd.read_csv(CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv') +perf_df = pd.read_csv(CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv') # Removing missing rows perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', 'false_omission_rate_diff'] -perf_df= perf_df.dropna(subset=perf, how='all') +#perf_df= perf_df.dropna(subset=perf, how='all') perf_df = perf_df.replace([np.inf, -np.inf], np.nan).fillna(0) # loading baseline -baseline_df = pd.read_csv(CSV_FILES/'Baseline_cleaned_perfomance2025-09-18T00:48:53.537033.csv') +baseline_df = pd.read_csv(CSV_FILES/'Baseline_cleaned_perfomance2026-02-07T02:23:19.806782.csv') base= ['baseline_acc','baseline_precision', 'baseline_recall', 'baseline_di', 'baseline_statistical_parity_diff', 'baseline_equal_opp_diff', 'baseline_error_rate_diff', 'baseline_error_rate_ratio', 'baseline_false_omission_rate_diff'] -baseline_df= baseline_df.dropna(subset=base, how='all') +#baseline_df= baseline_df.dropna(subset=base, how='all') baseline_df= baseline_df.fillna(0) baseline_df= baseline_df.drop(columns=['run_ts','run_id','baseline_score_count']) @@ -41,7 +41,7 @@ 'acc','baseline_acc','precision','baseline_precision','recall', 'baseline_recall','di','baseline_di','statistical_parity_diff','baseline_statistical_parity_diff','equal_opp_diff', 'baseline_equal_opp_diff','error_rate_diff','baseline_error_rate_diff','error_rate_ratio','baseline_error_rate_ratio', -'false_omission_rate_diff','baseline_false_omission_rate_diff','score_count'] +'false_omission_rate_diff','baseline_false_omission_rate_diff'] 
clean_df=clean_df[columns] # filtering target10 task @@ -102,5 +102,9 @@ def improvement(s): value_vars=['total','success','improvement'],var_name='count_type',value_name='count') sns.set_context(context='poster',font_scale = .5) output= os.path.join(GRAPHS,'target10_success.png' ) -sns.catplot(df_improvement_stats_tall, col = 'model',x='task_dataset', y='count',hue='count_type',kind='bar').savefig(output) +g=sns.catplot(df_improvement_stats_tall, col = 'model',x='task_dataset', y='count',hue='count_type',kind='bar') +# Rotate x-axis labels +for ax in g.axes.flatten(): + ax.tick_params(axis='x', labelrotation=35) + g.savefig(output) diff --git a/fairnessbench_analysis/target_selection.py b/fairnessbench_analysis/target_selection.py index 45e7d15..9e22e35 100644 --- a/fairnessbench_analysis/target_selection.py +++ b/fairnessbench_analysis/target_selection.py @@ -6,12 +6,12 @@ from path import CSV_FILES,GRAPHS # Loading useful dataframes -file = CSV_FILES/'Final_step_perfomance2025-09-18T00:48:26.025263.csv' +file = CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' res = pd.read_csv(file) task_data_metric = res['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) wider_code = pd.concat([res, task_data_metric],axis=1) -wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','score_count'] +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff'] wider_code = wider_code[wider_cols] wider_adrecon = wider_code[wider_code['task_dataset']=='adrecon'].reset_index(drop=True) From 36df8243c3aeaa7137dcf56a603814f690c5dffa Mon Sep 17 00:00:00 2001 From: Ritta Neg mfa Date: Fri, 20 
Feb 2026 18:23:14 +0000 Subject: [PATCH 7/9] new analysis files --- .../acc_di_tradeoff_heatmap.py | 129 +++++++++++ .../acc_fairness_overlap_heatmap.py | 155 +++++++++++++ .../comparing_flake8_bal_be_impli.py | 48 ++++ .../dollarstreet_analysis.py | 61 +++++ fairnessbench_analysis/run_count.py | 66 ++++++ .../sensitivity_analysis.py | 216 ++++++++++++++++++ 6 files changed, 675 insertions(+) create mode 100644 fairnessbench_analysis/acc_di_tradeoff_heatmap.py create mode 100644 fairnessbench_analysis/acc_fairness_overlap_heatmap.py create mode 100644 fairnessbench_analysis/comparing_flake8_bal_be_impli.py create mode 100644 fairnessbench_analysis/dollarstreet_analysis.py create mode 100644 fairnessbench_analysis/run_count.py create mode 100644 fairnessbench_analysis/sensitivity_analysis.py diff --git a/fairnessbench_analysis/acc_di_tradeoff_heatmap.py b/fairnessbench_analysis/acc_di_tradeoff_heatmap.py new file mode 100644 index 0000000..e116652 --- /dev/null +++ b/fairnessbench_analysis/acc_di_tradeoff_heatmap.py @@ -0,0 +1,129 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + + +file= CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' +perf_alt = pd.read_csv(file) +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff','final_flake8_score'] +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) 
+wider_cols=['model','task','dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','final_flake8_score'] +wider_code = wider_code[wider_cols] + +task_task_dem = wider_code['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'rsch_prob',1:'dem'}) +wider = pd.concat([wider_code, task_task_dem],axis=1) +cols= ['model','task','dataset','task_metric','task-dem','rsch_prob','dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','final_flake8_score'] +wider=wider[cols] +wider=wider.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +# Filtering only balance task from the dataframe +wider_balance = wider[wider['rsch_prob'].isin(["balance", "implicit", "best"])].copy() + +# Adding fairness column +df = wider_balance.copy() +df["fair"] = 1 - (df['di'] - 1).abs() +df["fair"] = df["fair"].clip(lower=0) + +def pareto_max(df, x, y): + """ + Return Pareto optimal points (maximizing both x and y). 
+ """ + data = df[[x, y]].to_numpy() + keep = np.ones(len(df), dtype=bool) + + for i in range(len(df)): + # point j dominates i if: + # j is >= in both AND > in at least one + dominates = np.all(data >= data[i], axis=1) & np.any(data > data[i], axis=1) + dominates[i] = False + if np.any(dominates): + keep[i] = False + + return df[keep] + +pareto_all = [] + +# loop through each panel and research problem +for (dataset, model, prob), g in df.groupby(['dataset','model', 'rsch_prob']): + front = pareto_max(g, 'acc', "fair") + + # radius and angle for Pareto points + front = front.copy() + front["r"] = np.sqrt(front['acc']**2 + front["fair"]**2) + front["theta"] = np.arctan2(front["fair"], front['acc']) # radians + + pareto_all.append(front) + +pareto_df = pd.concat(pareto_all, ignore_index=True) + +r_summary = pareto_df.groupby(['dataset','model', 'rsch_prob'])["r"].mean().reset_index() +r_summary = r_summary.rename(columns={"r": "r_mean"}) + +# Function to calculate circular mean +def circ_mean(theta): + return np.arctan2(np.mean(np.sin(theta)), np.mean(np.cos(theta))) + +theta_summary = ( + pareto_df.groupby(['dataset','model', 'rsch_prob'])["theta"] + .apply(circ_mean) + .reset_index() + .rename(columns={"theta": "theta_mean"}) +) + +summary = r_summary.merge(theta_summary, on=['dataset','model', 'rsch_prob']) +summary["theta_mean_deg"] = np.degrees(summary["theta_mean"]) + +angle_ranges = summary.groupby("rsch_prob")["theta_mean_deg"].agg(["min","max"]) + +# Function to calculate the overlap between two ranges +def range_overlap(a_min, a_max, b_min, b_max): + overlap = max(0, min(a_max, b_max) - max(a_min, b_min)) + total = max(a_max, b_max) - min(a_min, b_min) + return overlap / total if total > 0 else 0 + +pairs = [("balance","implicit"), + ("balance","best"), + ("implicit","best")] + + +# centering +summary["theta_centered"] = summary["theta_mean_deg"] - 45 + +sns.set_context(context='poster',font_scale= 0.75) +cmap = sns.diverging_palette(145, 300, 
as_cmap=True) +summary["pm"] = summary["rsch_prob"].astype(str) + "-" + summary["model"].astype(str) +pivot = summary.pivot( + index="pm", + columns="dataset", + values="theta_centered" +) + + +plt.figure(figsize=(12,8)) + +sns.heatmap( + pivot, + cmap=cmap, + center=0, + vmin=-45, + vmax=45, + annot=True, + fmt=".1f" + ) + + +plt.xlabel("Dataset") +plt.ylabel("Research Problem | Model") +plt.tight_layout() +output= os.path.join(GRAPHS,'acc_di_tradeoff_heatmap.png') +plt.savefig(output, dpi=300, bbox_inches='tight') diff --git a/fairnessbench_analysis/acc_fairness_overlap_heatmap.py b/fairnessbench_analysis/acc_fairness_overlap_heatmap.py new file mode 100644 index 0000000..f8dcd16 --- /dev/null +++ b/fairnessbench_analysis/acc_fairness_overlap_heatmap.py @@ -0,0 +1,155 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + + +file= CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' +perf_alt = pd.read_csv(file) + +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff','final_flake8_score'] + +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'task_dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','task_dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','final_flake8_score'] +wider_code = wider_code[wider_cols] + +task_task_dem = wider_code['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'rsch_prob',1:'dem'}) +wider = pd.concat([wider_code, 
task_task_dem],axis=1) +cols= ['model','task','task_dataset','task_metric','task-dem','rsch_prob','dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','final_flake8_score'] +wider=wider[cols] +wider=wider.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +# summary df of the mean and std +summary = ( + wider.groupby(["model", "task_dataset", "rsch_prob"]) + .agg( + mean_acc=("acc","mean"), + mean_di=("di","mean"), + std_acc=("acc","std"), + std_di=("di","std") + ) + .reset_index() +) + +allowed_datasets = ["randoadult","sampadult","nondescriptive",'health'] + +results = [] + +# here we loop over each (model, research problem) group +for (model, rp), group in summary.groupby(["model","rsch_prob"]): + + # find baseline (adult) + base_row = group[group["task_dataset"]=="adult"] +#if a model+research-problem doesn’t have an Adult row, I can’t compute diffs, so skip it + if base_row.empty: + continue +# extracting adults mean n std , .iloc[0] grabs the first row value (assumes only one Adult row exists in this group). 
+ base_di = base_row["mean_di"].iloc[0] + base_acc = base_row["mean_acc"].iloc[0] + # build baseline DI interval using variance + base_min_di = base_di - base_row["std_di"].iloc[0] + base_max_di = base_di + base_row['std_di'].iloc[0] + # build baseline ACC interval using std + base_min_acc = base_acc - base_row["std_acc"].iloc[0] + base_max_acc = base_acc + base_row['std_acc'].iloc[0] +# iterate through each dataset std (adult, randoadult, sampadult, …) for this model+rproblem + for _, row in group.iterrows(): + # get dataset name for this row + dataset = row["task_dataset"] + # skip baseline itself, don't compare adult to itself + if dataset == "adult": + continue + if dataset not in allowed_datasets: + continue + + change_type = "data_change" + if row["task_dataset"] == "nondescriptive": + change_type = "context_change" + row_max_di = (row["mean_di"] + row['std_di']) + row_min_di = (row["mean_di"] - row['std_di']) + + row_max_acc = (row["mean_acc"] + row['std_acc']) + row_min_acc = (row["mean_acc"] - row['std_acc']) + results.append({ + "model": model, + "rsch_prob": rp, + "baseline_dataset": "adult", + "comparison_dataset": dataset, + # Differences in mean fairness and accuracy, these measure how far the means move from Adult baseline. 
+ "fairness_diff": abs(base_di - row["mean_di"]), + "accuracy_diff": abs(base_acc - row["mean_acc"]), + 'min_fair_dff':row_min_di, + 'max_fair_diff':row_max_di, + 'min_acc_diff':row_min_acc, + 'max_acc_diff':row_max_acc, + # compute overlap between baseline interval and comparison interval + # overlap = 0 intervals are disjoint (stronger evidence of change) + # overlap large intervals similar/overlapping (weaker evidence of change) + 'overlap_fair': max(0,min(base_max_di,row_max_di)-max(base_min_di,row_min_di)), # width + # Baseline interval length + 'len_man_min': base_max_di - base_min_di, + 'overlap_acc': max(0,min(base_max_acc,row_max_acc)-max(base_min_acc,row_min_acc)), # width + 'len_acc_minmax':base_max_acc - base_min_acc, + "change_type": change_type + }) + + +sens_df = pd.DataFrame(results) +sens_df['final_overlap_fair'] = sens_df['overlap_fair'] / sens_df['len_man_min'] +sens_df['final_overlap_acc'] = sens_df['overlap_acc'] / sens_df['len_acc_minmax'] + +# heatmap for fairness_overlap +heatmap_data = sens_df.pivot_table( + index=["rsch_prob", "model"], + columns="comparison_dataset", + values="final_overlap_fair", + aggfunc="mean" +).sort_index(level=["rsch_prob", "model"]) + +plt.figure(figsize=(12,6)) + +sns.heatmap( + heatmap_data, + annot=True, + cmap="viridis", + linewidths=0.5 +) + +plt.title("Fairness Sensitivity Across Dataset Variants") +plt.xlabel("Comparison Dataset") +plt.ylabel("Research Problem | Model") + +output= os.path.join(GRAPHS,'fairness_overlap_heatmap.png') +plt.savefig(output, dpi=300, bbox_inches='tight') + +# heatmap for acc_overlap +heatmap_data = sens_df.pivot_table( + index=["rsch_prob", "model"], + columns="comparison_dataset", + values="final_overlap_acc", + aggfunc="mean" +).sort_index(level=["rsch_prob", "model"]) + +plt.figure(figsize=(12,6)) + +sns.heatmap( + heatmap_data, + annot=True, + cmap="viridis", + linewidths=0.5 +) + +plt.title("Accuracy Sensitivity Across Dataset Variants") +plt.xlabel("Comparison Dataset") 
+plt.ylabel("Research Problem | Model") +output= os.path.join(GRAPHS,'acc_overlap_heatmap.png') +plt.savefig(output, dpi=300, bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/comparing_flake8_bal_be_impli.py b/fairnessbench_analysis/comparing_flake8_bal_be_impli.py new file mode 100644 index 0000000..01310a1 --- /dev/null +++ b/fairnessbench_analysis/comparing_flake8_bal_be_impli.py @@ -0,0 +1,48 @@ +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns +from path import CSV_FILES,GRAPHS + + +file= CSV_FILES/'Final_step_perfomance2026-02-07T02:23:17.201613.csv' +perf_alt = pd.read_csv(file) +perf= ['acc','precision', 'recall', 'di', 'statistical_parity_diff', + 'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', + 'false_omission_rate_diff','final_flake8_score'] +perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0) + +# expanding the task to sub columns +task_data_metric = perf_alt['task'].str.split('_').apply(pd.Series).rename(columns ={0:'dataset',1:'task_metric',2:'task-dem'}) +wider_code = pd.concat([perf_alt, task_data_metric],axis=1) +wider_cols=['model','task','dataset','task_metric','task-dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','final_flake8_score'] +wider_code = wider_code[wider_cols] + +task_task_dem = wider_code['task-dem'].str.split('-').apply(pd.Series).rename(columns ={0:'rsch_prob',1:'dem'}) +wider = pd.concat([wider_code, task_task_dem],axis=1) +cols= ['model','task','dataset','task_metric','task-dem','rsch_prob','dem','run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff','final_flake8_score'] +wider=wider[cols] +wider=wider.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') + +# 
Filtering only balance task from the dataframe +wider_balance = wider[wider['rsch_prob'].isin(["balance", "implicit", "best"])].copy() +sns.set_context(context='poster',font_scale= 0.75) +g=sns.catplot( + data=wider_balance, + x="rsch_prob", + y="final_flake8_score", + col="model", + row="dataset", + kind="bar", + height=4, + aspect=1 +) +g.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}') + +for ax in g.axes.flat: + ax.axhline(y=85, color='black', linestyle='--') + +output= os.path.join(GRAPHS,'comparing_flake8_bal_be_impli.png') +plt.savefig(output, dpi=300, bbox_inches='tight') \ No newline at end of file diff --git a/fairnessbench_analysis/dollarstreet_analysis.py b/fairnessbench_analysis/dollarstreet_analysis.py new file mode 100644 index 0000000..a31d9b7 --- /dev/null +++ b/fairnessbench_analysis/dollarstreet_analysis.py @@ -0,0 +1,61 @@ + +import pandas as pd + + +df_final = pd.read_csv("Final_step_perfomance2026-02-07T02:23:17.201613.csv") +df_final.head() + + +df_baseline = pd.read_csv("Baseline_cleaned_perfomance2026-02-07T02:23:19.806782.csv") +df_baseline.head(2) + +df_dollar_baseline = df_baseline[df_baseline['baseline_Advantaged'].notna() & df_baseline['baseline_Disadvantaged'].notna()] + +cols_to_keep = ['task', 'run_ts', 'run_id','baseline_Advantaged', 'baseline_Disadvantaged'] + +df_dollar_baseline = df_dollar_baseline[cols_to_keep] +df_dollar_baseline + + +df_dollar_baseline['model'] = 'baseline' +df_dollar_baseline + + +df_final.columns + + +cols_to_keep = ['model', 'task', 'run_ts', 'run_id','Advantaged', 'Disadvantaged'] + + +df_dollar_res = df_final[df_final['Advantaged'].notna() & df_final['Disadvantaged'].notna()] + + +df_dollar_res = df_dollar_res[cols_to_keep] +df_dollar_res.head(2) + + +df_dollar_res['model'].value_counts() + + +df_dollar_baseline = df_dollar_baseline.rename(columns={"baseline_Advantaged": "Advantaged", "baseline_Disadvantaged": "Disadvantaged"}) + + +df_combined = pd.concat([df_dollar_baseline, 
df_dollar_res], ignore_index=True) +df_combined.head(2) + +df_combined['model'] = df_combined['model'].replace('claude-3-7-sonnet-20250219', 'claude-3-7-sonnet') +df_combined + + +df_avg = (df_combined.groupby('model').agg(avg_adv_acc=("Advantaged", "mean"), +avg_disadv_acc=("Disadvantaged", "mean"), std_adv_acc=("Advantaged", "std"), +std_disadv_acc=("Disadvantaged", "std"), n_runs=("Advantaged", "count")).reset_index()) +df_avg + + +df_avg['disparity']= df_avg['avg_adv_acc']- df_avg['avg_disadv_acc'] +df_avg + + + + diff --git a/fairnessbench_analysis/run_count.py b/fairnessbench_analysis/run_count.py new file mode 100644 index 0000000..e20d7e3 --- /dev/null +++ b/fairnessbench_analysis/run_count.py @@ -0,0 +1,66 @@ +import pandas as pd +import os +from datetime import datetime + +# Read and combine all result files +result_path = '/scratch3/workspace/ayman_sandouk_uri_edu-fairness/fairnessBench/final_results' +result_files = [ + os.path.join(result_path, resjson) + for resjson in os.listdir(result_path) + if os.path.isfile(os.path.join(result_path, resjson)) +] +result_list = [pd.read_json(rf).T for rf in result_files] +result_df = pd.concat(result_list) +print(f"Total rows before filtering: {len(result_df)}") + +end_series = lambda s: pd.Series(s[-5:]) +sp = result_df['path'].str.split('/').apply(end_series) +sp = sp.rename(columns={i: c for i, c in enumerate(['model', 'task', 'run_ts'])}) +exp_score = result_df.join(sp[['model', 'task', 'run_ts']]) +exp_score['run_id'] = exp_score.groupby(['model', 'task']).cumcount() + +cols = ['model', 'task', 'run_ts','run_id'] + [col for col in exp_score.columns if col not in ['model', 'task', 'run_ts', 'run_id']] +exp_score = exp_score[cols] + + +no_final = ~exp_score["final_score"].apply(lambda x: isinstance(x, dict)) +has_final = ~no_final +no_err = exp_score["error"] == "" +time = exp_score["total_time"] > 0 + +# aggregate counts per model/task +summary = ( + exp_score + .assign( + has_final_score = has_final, + 
time_no_error = (time & no_err) + ) + .groupby(["model", "task"]) + .agg( + runs=("run_ts", "count"), + sucessful_runs=("has_final_score", "sum"), + completed_runs =('time_no_error','sum') + ) + .reset_index() +) +task_decomp = summary['task'].str.split('_').apply(pd.Series).rename( + columns={i:col for i,col in enumerate(['dataset','target_metric','task-dem'])}) +task_dem = task_decomp['task-dem'].str.split('-').apply(pd.Series).rename( + columns={i:col for i,col in enumerate(['research_problem','dem'])}) + +df = pd.concat([summary,task_decomp,task_dem],axis=1) +df +new_df = ( + df.groupby(['model','research_problem'])[ + ['runs','completed_runs','sucessful_runs'] + ] + .sum() + .reset_index() +) + +new_df['success_rate'] = new_df['sucessful_runs'] / new_df['runs'] +#new_df['completion_rate'] = new_df['completed_runs'] / new_df['runs'] +print(new_df.to_latex()) + +m = new_df.groupby(['model','research_problem'])['success_rate'].mean().unstack() +print(m.to_latex()) diff --git a/fairnessbench_analysis/sensitivity_analysis.py b/fairnessbench_analysis/sensitivity_analysis.py new file mode 100644 index 0000000..92f24ab --- /dev/null +++ b/fairnessbench_analysis/sensitivity_analysis.py @@ -0,0 +1,216 @@ + +import os +import pandas as pd +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +import seaborn as sns + + +df_final = pd.read_csv("Final_step_perfomance2026-02-07T02:23:17.201613.csv") +#df_baseline = pd.read_csv("Baseline_cleaned_perfomance2026-02-07T02:23:19.806782.csv") + + +df_final.columns + + +df_final.head(2) + + +#df_baseline.head(2) + + +cols_to_keep = ['model', 'task', 'run_ts', 'run_id', 'acc', 'precision', 'recall', 'di', 'statistical_parity_diff', +'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio', 'false_omission_rate_diff'] + + +df_final = df_final[cols_to_keep] +df_final.head(2) + +list_of_values = ['adult_balance-eod-sex', 'adult_eod_balance-sex', 'adult_balance-eod-nosuccess-sex', 
+'adult_balance-eod-shortgoal-sex', 'adult_balance-eod-noreq-sex', 'adult_balance-eod-nochange-sex', 'adult_balance-eod-nohow-sex', +'adult_balance-eod-rephrased01-sex', 'adult_balance-eod-rephrased10-sex', 'adult_balance-eod-rephrased06-sex', +'adult_balance-eod-rephrased05-sex', 'adult_balance-eod-rephrased03-sex', 'adult_balance-eod-altmetricdetail-sex', +'adult_balance-eod-altmetricdetail2-sex'] + +filtered_df = df_final[df_final['task'].isin(list_of_values)] +filtered_df + + +filtered_df['task']=filtered_df['task'].replace('adult_eod_balance-sex', 'adult_balance-eod-original-sex') + + +filtered_df['task'].value_counts() + + +data_task = filtered_df['task'].str.split('_').apply(pd.Series).rename(columns={0: 'data', 1: 'task_info'}) +data_task + + +data_task2 = data_task['task_info'].str.split('-').apply(pd.Series).rename(columns={0:'rp', 1:'f_metric', 2:'prompt_variation',3:'dem'}) +data_task2 + + +df_wide = pd.concat([filtered_df, data_task, data_task2],axis=1) +cols=['model','data','rp','f_metric','prompt_variation','dem', 'run_ts','run_id','acc','precision','recall','di','statistical_parity_diff','equal_opp_diff','error_rate_diff','error_rate_ratio','false_omission_rate_diff'] +df_wide = df_wide[cols] +df_wide=df_wide.replace('claude-3-7-sonnet-20250219','claude-3-7-sonnet') +df_wide.head() + + + +def get_metric(df, model, rp, data, f_metric, prompt_variation): + return df[ + (df["model"]==model) & + (df["rsch_prob"]==rp) & + (df["task_dataset"]==data)& + (df["prompt_variation"]==prompt_variation) + ][metric].values + + +df_wide['prompt_variation'].value_counts() + + +count = df_wide.groupby(['model', 'prompt_variation'])["rp"].count() + + +count.to_csv("counts.csv") + + +df_wide = df_wide[df_wide["prompt_variation"] != "noreq"] + + +df_wide['prompt_variation'].value_counts() + + +df_wide["prompt_variation"] = df_wide["prompt_variation"].replace({ + 'altmetricdetail': 'altmetricnames', + 'altmetricdetail2': 'informalgoal', + 'rephrased01': 'informaldirect', 
+ 'rephrased03': 'altnowork', + 'rephrased05': 'verbosedetail', + 'rephrased06': 'informationalpassive', + 'rephrased10': 'passivedata'}) + + +df_wide['prompt_variation'].unique() + + +summary = ( + df_wide.groupby(["model", "rp", "prompt_variation"]) + .agg( + mean_acc=("acc","mean"), + mean_di=("di","mean"), + std_acc=("acc","std"), + std_di=("di","std") + ) + .reset_index() +) +summary + +allowed_prompts = ['altmetricnames', 'informalgoal', 'nochange', 'nohow', 'nosuccess', 'informaldirect', 'altnowork', +'verbosedetail', 'informationalpassive', 'passivedata', 'shortgoal'] + +results = [] + +for (model, rp), group in summary.groupby(["model", "rp"]): + + # baseline = original prompt + base_row = group[group["prompt_variation"] == "original"] + + if base_row.empty: + continue + + base_row = base_row.iloc[0] + + # baseline interval + base_min_di = base_row["mean_di"] - base_row["std_di"] + base_max_di = base_row["mean_di"] + base_row["std_di"] + base_min_acc = base_row["mean_acc"] - base_row["std_acc"] + base_max_acc = base_row["mean_acc"] + base_row["std_acc"] + + for _, row in group.iterrows(): + + prompt = row["prompt_variation"] + + if prompt == "original": + continue + + if prompt not in allowed_prompts: + continue + + # variation interval + row_min_di = row["mean_di"] - row["std_di"] + row_max_di = row["mean_di"] + row["std_di"] + row_min_acc = row["mean_acc"] - row["std_acc"] + row_max_acc = row["mean_acc"] + row["std_acc"] + + results.append({ + "model": model, + "rsch_prob": rp, + "baseline_prompt": "original", + "comparison_prompt": prompt, + # mean shifts + "fairness_diff": abs(base_row["mean_di"] - row["mean_di"]), + "accuracy_diff": abs(base_row["mean_acc"] - row["mean_acc"]), + # overlap between two intervals + "overlap_fair": max(0, min(base_max_di, row_max_di) - max(base_min_di, row_min_di)), + "overlap_acc": max(0, min(base_max_acc, row_max_acc) - max(base_min_acc, row_min_acc)), + # baseline interval length + "len_fair_base": base_max_di - 
base_min_di, + "len_acc_base": base_max_acc - base_min_acc}) + +sens_df = pd.DataFrame(results) + + +sens_df['comparison_prompt'].unique() + + +sens_df['final_overlap_fair'] = sens_df['overlap_fair'] / sens_df['len_fair_base'] +sens_df['final_overlap_acc'] = sens_df['overlap_acc'] / sens_df['len_acc_base'] +sens_df + + +heatmap_data = sens_df.pivot_table( + index=["rsch_prob", "model"], + columns="comparison_prompt", + values="final_overlap_fair", + aggfunc="mean" +).sort_index(level=["rsch_prob", "model"]) + + +plt.figure(figsize=(12,6)) + +sns.heatmap( + heatmap_data, + annot=True, + cmap="viridis", + linewidths=0.5 +) +plt.title("Fairness Sensitivity Across Prompt Variations") +plt.xlabel("Comparison Prompts") +plt.ylabel("Research Problem | Model") + +plt.savefig("prompt_sensitivity_fair_overlap.png", dpi=200, bbox_inches="tight") + + +heatmap_data = sens_df.pivot_table( + index=["rsch_prob", "model"], + columns="comparison_prompt", + values="final_overlap_acc", + aggfunc="mean" +).sort_index(level=["rsch_prob", "model"]) + +plt.figure(figsize=(12,6)) + +sns.heatmap( + heatmap_data, + annot=True, + cmap="viridis", + linewidths=0.5 +) + +plt.title("Accuracy Sensitivity Across Prompt Variants") +plt.xlabel("Comparison Prompts") +plt.ylabel("Research Problem | Model") +plt.savefig("prompt_sensitivity_acc_overlap.png", dpi=200, bbox_inches="tight") From 3f7d6044a1befc3be195dbf2ffc85e49ea5e18fe Mon Sep 17 00:00:00 2001 From: Ritta Neg mfa Date: Sun, 1 Mar 2026 02:01:29 +0000 Subject: [PATCH 8/9] new changes to readme --- fairnessbench_analysis/readme.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fairnessbench_analysis/readme.md b/fairnessbench_analysis/readme.md index 3ccb1d2..bd40987 100644 --- a/fairnessbench_analysis/readme.md +++ b/fairnessbench_analysis/readme.md @@ -36,17 +36,17 @@ To run an analysis, change the input CSV filename in the script to the file requ python ....py ``` **Key files:** -- 
adult_fairness.py: Analyze the fairness metrics used in the benchmark for the Adult dataset. -- balancing_fairness.py: Analyzes the fairness metrics used in the benchmark for the Balance research problem. -- di_across_datasets.py: Analyzes fairness (disparate impact (DI) and equal opportunity diff (EOB))and accuracy across datasets and research problems. -- comparing_flake8_bal_be_impli.py: Generates a bar plot of Flake8 performance for 3 research problems ( balance, best, implicit) for different models and datasets. -- target_selection.py: Analyze the models performance and fairness metrics for the targey selection tasks. -- target10_sucess_rate.py: Generates a plot showing how agent performance differs from the baseline on the Target10 research problem, by dataset and model. -- acc_di_tradeoff_heatmap.py: Generates a heatmap that shows the tradeoff between accuracy and fairness (disparate impact) for 3 research problems ( balance, best, implicit) using the Pareto frontier. -- acc_fairness_overlap_heatmap.py: Analyses the variation in datasets (randoadult,sampadult, nondescriptive, health) to the adult dataset. -- run_count.py: Analyzes the number of completed and successful runs for each model and dataset. -- sensitivity_analysis.py: code analyzing the LLM agent's performance and fairness for different versions of the same prompt. -- dollarstreet_analysis.py: codebase for analyzing the performance across income levels. +- adult_fairness.py: Analyze the target fairness metrics used in the benchmark for the Adult dataset. It generates Figure 4 in section 4.2.2 +- balancing_fairness.py: Analyzes the fairness metrics used in the benchmark for the Balance research problem. This file generates Figure 6 in Section 4.3. +- di_across_datasets.py: Analyzes fairness (disparate impact (DI) and equal opportunity diff (EOB))and accuracy across datasets and research problems. It generates Figure 3 found in section 4.2.1. and figure 13, which is found in the appendix. 
+- comparing_flake8_bal_be_impli.py: Generates a bar plot (explained in Section 4.3) of Flake8 performance for 3 research problems (balance, best, implicit) for different models and datasets. +- target_selection.py: Analyzes the model's performance and fairness metrics for the target selection tasks. It generates Figure 9 in Section 4.5.1. +- target10_sucess_rate.py: Generates Figure 5 in Section 4.2.3, which shows how agent performance differs from the baseline on the Target10 research problem, by dataset and model. +- acc_di_tradeoff_heatmap.py: Generates a heatmap (Figure 7, Section 4.3) that shows the tradeoff between accuracy and fairness (disparate impact) for 3 research problems (balance, best, implicit) using the Pareto frontier. +- acc_fairness_overlap_heatmap.py: Analyses the variation in datasets (randoadult, sampadult, nondescriptive, health) compared to the adult dataset. It generates Figures 8 and 12 in Section 4.4. +- run_count.py: Analyzes the number of completed and successful runs for each model and dataset. It generates Tables 4 and 10 in the appendix. +- sensitivity_analysis.py: code analyzing the LLM agent's performance and fairness for different versions of the same prompt. This file generates Figures 10 and 11 in Section 4.6. +- dollarstreet_analysis.py: codebase for analyzing the performance across income levels, which is explained in Section 4.5.2. From 11375f500f7321b4fa8426767423267b12c978c5 Mon Sep 17 00:00:00 2001 From: Ritta Neg mfa Date: Wed, 15 Apr 2026 16:53:37 +0000 Subject: [PATCH 9/9] changes to readme --- fairnessbench_analysis/readme.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fairnessbench_analysis/readme.md b/fairnessbench_analysis/readme.md index bd40987..4ea8652 100644 --- a/fairnessbench_analysis/readme.md +++ b/fairnessbench_analysis/readme.md @@ -10,6 +10,7 @@ The main analysis script is explode_results.py, which loads the raw results data 2.
Create the csv_files and graphs directories, then set CSV_FILES and GRAPHS in paths.py to the absolute paths on your machine. 3. `paths.py` is in `.gitignore`. ### Required variables in `paths.py` +Raw results are the original JSON outputs generated by running agents on benchmark tasks, containing performance metrics, fairness scores, and Flake8 scores before any processing or analysis. - **PROJECT_ROOT** — Directory that contains all *raw results*. - **CSV_FILES** — Directory that contains the *clean CSV files* produced by `explode_results.py`. - **GRAPHS** — Directory where analysis scripts will save generated *figures/plots*. @@ -27,7 +28,7 @@ This will create the following files in the csv_files/ directory: These files are then used for futher analysis. -**Analysis** +## Analysis: In FairnessBench we run several analyses on our results. Each `.py` file performs a different analysis and generates plots that are stored in the `graphs/` directory. To run an analysis, change the input CSV filename in the script to the file required for that analysis. **Example:** To analyze different types of fairness for the Adult dataset, run `adult_fairness.py`. Before running it, update the script’s input CSV to the new file generated in the `csv_files/` directory. @@ -35,7 +36,7 @@ To run an analysis, change the input CSV filename in the script to the file requ ```python python ....py ``` -**Key files:** +## Key files: - adult_fairness.py: Analyze the target fairness metrics used in the benchmark for the Adult dataset. It generates Figure 4 in section 4.2.2 - balancing_fairness.py: Analyzes the fairness metrics used in the benchmark for the Balance research problem. This file generates Figure 6 in Section 4.3. - di_across_datasets.py: Analyzes fairness (disparate impact (DI) and equal opportunity diff (EOB))and accuracy across datasets and research problems. It generates Figure 3 found in section 4.2.1. and figure 13, which is found in the appendix.