Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ Run_scripts/*
run_experiments_*.sh
test*.txt
eval*.sh
fairnessbench_analysis/*/*.png
fairnessbench_analysis/*/*.csv




# path
path.py
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
129 changes: 129 additions & 0 deletions fairnessbench_analysis/acc_di_tradeoff_heatmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from path import CSV_FILES,GRAPHS


# Final-step performance export to analyse.
file = CSV_FILES / 'Final_step_perfomance2026-02-07T02:23:17.201613.csv'
perf_alt = pd.read_csv(file)

# Metric columns carried through every reshaping step below.
perf = [
    'acc', 'precision', 'recall', 'di', 'statistical_parity_diff',
    'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio',
    'false_omission_rate_diff', 'final_flake8_score',
]

# Infinite values are treated as missing, then every missing value in the
# frame (any column) is replaced by 0.
perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0)

# Split `task` on "_" into dataset / metric / "<research problem>-<dem>".
# `expand=True` builds the columns directly instead of constructing one
# Series per row with `.apply(pd.Series)`.
task_data_metric = (
    perf_alt['task']
    .str.split('_', expand=True)
    .rename(columns={0: 'dataset', 1: 'task_metric', 2: 'task-dem'})
)
wider_code = pd.concat([perf_alt, task_data_metric], axis=1)
wider_cols = [
    'model', 'task', 'dataset', 'task_metric', 'task-dem', 'run_ts', 'run_id',
] + perf
wider_code = wider_code[wider_cols]

# Split the third task component on "-" into research problem and `dem`.
task_task_dem = (
    wider_code['task-dem']
    .str.split('-', expand=True)
    .rename(columns={0: 'rsch_prob', 1: 'dem'})
)
wider = pd.concat([wider_code, task_task_dem], axis=1)
cols = [
    'model', 'task', 'dataset', 'task_metric', 'task-dem', 'rsch_prob', 'dem',
    'run_ts', 'run_id',
] + perf
wider = wider[cols]
# Shorten the dated model identifier wherever it appears in the frame.
wider = wider.replace('claude-3-7-sonnet-20250219', 'claude-3-7-sonnet')

# Keep only the "balance", "implicit" and "best" research problems.
wider_balance = wider[wider['rsch_prob'].isin(["balance", "implicit", "best"])].copy()

# Fairness score derived from disparate impact: 1 when di == 1, decreasing
# linearly with |di - 1| and floored at 0.
df = wider_balance.copy()
df["fair"] = (1 - (df['di'] - 1).abs()).clip(lower=0)

def pareto_max(df, x, y):
    """Return the rows of ``df`` that are Pareto-optimal when maximizing ``x`` and ``y``.

    A row is discarded when some other row is at least as large in both
    columns and strictly larger in at least one.  Rows with identical
    ``(x, y)`` values do not dominate each other, so ties are all kept.

    Args:
        df: DataFrame containing the columns named by ``x`` and ``y``.
        x: Name of the first column to maximize.
        y: Name of the second column to maximize.

    Returns:
        The non-dominated subset of ``df``, with the original index, row
        order and columns preserved.  An empty ``df`` yields an empty result.
    """
    data = df[[x, y]].to_numpy()
    keep = np.ones(len(data), dtype=bool)

    for i, point in enumerate(data):
        # Row j dominates `point` if it is >= in both columns and > in at
        # least one.  A row can never be strictly greater than itself, so it
        # needs no explicit exclusion from its own comparison.
        dominated_by = np.all(data >= point, axis=1) & np.any(data > point, axis=1)
        keep[i] = not dominated_by.any()

    return df[keep]

# Collect the (acc, fair) Pareto front of every dataset / model /
# research-problem group, annotated with polar coordinates.
pareto_all = []

for _, group_rows in df.groupby(['dataset', 'model', 'rsch_prob']):
    front = pareto_max(group_rows, 'acc', "fair").assign(
        # distance of each front point from the origin of the (acc, fair) plane
        r=lambda pts: np.sqrt(pts['acc']**2 + pts["fair"]**2),
        # angle of each front point measured from the acc axis, in radians
        theta=lambda pts: np.arctan2(pts["fair"], pts['acc']),
    )
    pareto_all.append(front)

pareto_df = pd.concat(pareto_all, ignore_index=True)

# Mean radius of the front points per group.
r_summary = (
    pareto_df.groupby(['dataset', 'model', 'rsch_prob'])["r"]
    .mean()
    .rename("r_mean")
    .reset_index()
)

def circ_mean(theta):
    """Return the circular mean of the angles in ``theta`` (radians).

    The angles are averaged as unit vectors: the result is the direction of
    the mean sine / mean cosine pair, as given by ``arctan2``.
    """
    mean_sin = np.mean(np.sin(theta))
    mean_cos = np.mean(np.cos(theta))
    return np.arctan2(mean_sin, mean_cos)

polar_keys = ['dataset', 'model', 'rsch_prob']

# Circular mean of the Pareto-front angles for each group.
theta_by_group = pareto_df.groupby(polar_keys)["theta"].apply(circ_mean)
theta_summary = theta_by_group.rename("theta_mean").reset_index()

# One row per group holding the mean radius and the mean angle (rad and deg).
summary = pd.merge(r_summary, theta_summary, on=polar_keys)
summary["theta_mean_deg"] = np.degrees(summary["theta_mean"])

# Smallest and largest mean angle (degrees) observed per research problem.
angle_ranges = summary.groupby("rsch_prob")["theta_mean_deg"].agg(["min", "max"])

def range_overlap(a_min, a_max, b_min, b_max):
    """Return the overlap of two ranges as a fraction of their combined span.

    The overlap length (never negative) is divided by the distance between
    the smallest lower bound and the largest upper bound.  When that span is
    not positive the function returns 0.
    """
    span = max(a_max, b_max) - min(a_min, b_min)
    if not span > 0:
        return 0
    shared = max(0, min(a_max, b_max) - max(a_min, b_min))
    return shared / span

# Research-problem pairs; not referenced by the rest of the code shown here.
pairs = [
    ("balance", "implicit"),
    ("balance", "best"),
    ("implicit", "best"),
]

# Centre the mean angle on 45 degrees (the direction where fair == acc), so
# positive values lean towards fairness and negative values towards accuracy.
summary["theta_centered"] = summary["theta_mean_deg"] - 45

# Row label "<research problem>-<model>"; one heatmap column per dataset.
summary["pm"] = summary["rsch_prob"].astype(str) + "-" + summary["model"].astype(str)
pivot = summary.pivot(index="pm", columns="dataset", values="theta_centered")

sns.set_context(context='poster', font_scale=0.75)
cmap = sns.diverging_palette(145, 300, as_cmap=True)

plt.figure(figsize=(12, 8))

# Colour scale is symmetric around 0 and fixed to the +/-45 degree range.
heatmap_options = {
    "cmap": cmap,
    "center": 0,
    "vmin": -45,
    "vmax": 45,
    "annot": True,
    "fmt": ".1f",
}
ax = sns.heatmap(pivot, **heatmap_options)
ax.set_xlabel("Dataset")
ax.set_ylabel("Research Problem | Model")

plt.tight_layout()
output = os.path.join(GRAPHS, 'acc_di_tradeoff_heatmap.png')
plt.savefig(output, dpi=300, bbox_inches='tight')
155 changes: 155 additions & 0 deletions fairnessbench_analysis/acc_fairness_overlap_heatmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from path import CSV_FILES,GRAPHS


# Final-step performance export to analyse.
file = CSV_FILES / 'Final_step_perfomance2026-02-07T02:23:17.201613.csv'
perf_alt = pd.read_csv(file)

# Metric columns carried through every reshaping step below.
perf = [
    'acc', 'precision', 'recall', 'di', 'statistical_parity_diff',
    'equal_opp_diff', 'error_rate_diff', 'error_rate_ratio',
    'false_omission_rate_diff', 'final_flake8_score',
]

# Infinite values are treated as missing, then every missing value in the
# frame (any column) is replaced by 0.
perf_alt = perf_alt.replace([np.inf, -np.inf], np.nan).fillna(0)

# Split `task` on "_" into dataset / metric / "<research problem>-<dem>".
# `expand=True` builds the columns directly instead of constructing one
# Series per row with `.apply(pd.Series)`.
task_data_metric = (
    perf_alt['task']
    .str.split('_', expand=True)
    .rename(columns={0: 'task_dataset', 1: 'task_metric', 2: 'task-dem'})
)
wider_code = pd.concat([perf_alt, task_data_metric], axis=1)
wider_cols = [
    'model', 'task', 'task_dataset', 'task_metric', 'task-dem', 'run_ts', 'run_id',
] + perf
wider_code = wider_code[wider_cols]

# Split the third task component on "-" into research problem and `dem`.
task_task_dem = (
    wider_code['task-dem']
    .str.split('-', expand=True)
    .rename(columns={0: 'rsch_prob', 1: 'dem'})
)
wider = pd.concat([wider_code, task_task_dem], axis=1)
cols = [
    'model', 'task', 'task_dataset', 'task_metric', 'task-dem', 'rsch_prob', 'dem',
    'run_ts', 'run_id',
] + perf
wider = wider[cols]
# Shorten the dated model identifier wherever it appears in the frame.
wider = wider.replace('claude-3-7-sonnet-20250219', 'claude-3-7-sonnet')

# Mean and standard deviation of accuracy and DI for every
# model / dataset / research-problem combination.
metric_aggregations = {
    "mean_acc": ("acc", "mean"),
    "mean_di": ("di", "mean"),
    "std_acc": ("acc", "std"),
    "std_di": ("di", "std"),
}
grouped_runs = wider.groupby(["model", "task_dataset", "rsch_prob"])
summary = grouped_runs.agg(**metric_aggregations).reset_index()

# Dataset variants that are compared against the "adult" baseline.
allowed_datasets = ["randoadult", "sampadult", "nondescriptive", 'health']


def _overlap_width(lo_a, hi_a, lo_b, hi_b):
    """Width of the intersection of [lo_a, hi_a] and [lo_b, hi_b], floored at 0."""
    return max(0, min(hi_a, hi_b) - max(lo_a, lo_b))


results = []

# One pass per (model, research problem) group of the summary table.
for (model, rp), group in summary.groupby(["model", "rsch_prob"]):

    # Baseline is the "adult" row; without it nothing can be compared.
    base_row = group[group["task_dataset"] == "adult"]
    if base_row.empty:
        continue

    # First (assumed only) adult row of the group.
    base_di = base_row["mean_di"].iloc[0]
    base_acc = base_row["mean_acc"].iloc[0]
    base_std_di = base_row["std_di"].iloc[0]
    base_std_acc = base_row["std_acc"].iloc[0]

    # Baseline intervals: mean +/- one standard deviation.
    base_min_di, base_max_di = base_di - base_std_di, base_di + base_std_di
    base_min_acc, base_max_acc = base_acc - base_std_acc, base_acc + base_std_acc

    # Only the allowed variants are compared; this also leaves out the
    # baseline itself because "adult" is not in the allowed list.
    variants = group[group["task_dataset"].isin(allowed_datasets)]
    for _, row in variants.iterrows():
        dataset = row["task_dataset"]

        # "nondescriptive" counts as a context change, everything else as a
        # data change.
        change_type = "context_change" if dataset == "nondescriptive" else "data_change"

        # Comparison intervals: mean +/- one standard deviation.
        row_min_di = row["mean_di"] - row['std_di']
        row_max_di = row["mean_di"] + row['std_di']
        row_min_acc = row["mean_acc"] - row['std_acc']
        row_max_acc = row["mean_acc"] + row['std_acc']

        results.append({
            "model": model,
            "rsch_prob": rp,
            "baseline_dataset": "adult",
            "comparison_dataset": dataset,
            # Absolute shift of the group means relative to the baseline.
            "fairness_diff": abs(base_di - row["mean_di"]),
            "accuracy_diff": abs(base_acc - row["mean_acc"]),
            # Despite the "diff" in the key names, these four hold the bounds
            # of the comparison dataset's mean +/- std intervals.
            'min_fair_dff': row_min_di,
            'max_fair_diff': row_max_di,
            'min_acc_diff': row_min_acc,
            'max_acc_diff': row_max_acc,
            # Width shared by the baseline and comparison intervals:
            # 0 means the intervals are disjoint, larger means more overlap.
            'overlap_fair': _overlap_width(base_min_di, base_max_di, row_min_di, row_max_di),
            # Width of the baseline DI interval.
            'len_man_min': base_max_di - base_min_di,
            'overlap_acc': _overlap_width(base_min_acc, base_max_acc, row_min_acc, row_max_acc),
            # Width of the baseline accuracy interval.
            'len_acc_minmax': base_max_acc - base_min_acc,
            "change_type": change_type,
        })


sens_df = pd.DataFrame(results)
# Overlap expressed as a fraction of the baseline interval width.
sens_df['final_overlap_fair'] = sens_df['overlap_fair'] / sens_df['len_man_min']
sens_df['final_overlap_acc'] = sens_df['overlap_acc'] / sens_df['len_acc_minmax']

def _save_overlap_heatmap(frame, value_column, title, filename):
    """Draw one overlap heatmap and save it under ``GRAPHS``.

    Args:
        frame: Table with ``rsch_prob``, ``model``, ``comparison_dataset``
            and the column named by ``value_column``.
        value_column: Column whose mean fills the heatmap cells.
        title: Title shown above the heatmap.
        filename: File name of the image written inside ``GRAPHS``.

    Returns:
        The path the figure was saved to.
    """
    # Rows: (research problem, model); columns: comparison dataset.
    heatmap_data = frame.pivot_table(
        index=["rsch_prob", "model"],
        columns="comparison_dataset",
        values=value_column,
        aggfunc="mean",
    ).sort_index(level=["rsch_prob", "model"])

    plt.figure(figsize=(12, 6))

    sns.heatmap(
        heatmap_data,
        annot=True,
        cmap="viridis",
        linewidths=0.5,
    )

    plt.title(title)
    plt.xlabel("Comparison Dataset")
    plt.ylabel("Research Problem | Model")

    output = os.path.join(GRAPHS, filename)
    plt.savefig(output, dpi=300, bbox_inches='tight')
    return output


# heatmap for fairness_overlap
_save_overlap_heatmap(
    sens_df,
    "final_overlap_fair",
    "Fairness Sensitivity Across Dataset Variants",
    'fairness_overlap_heatmap.png',
)

# heatmap for acc_overlap
_save_overlap_heatmap(
    sens_df,
    "final_overlap_acc",
    "Accuracy Sensitivity Across Dataset Variants",
    'acc_overlap_heatmap.png',
)
59 changes: 59 additions & 0 deletions fairnessbench_analysis/adult_di_code_llmeval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from path import CSV_FILES,GRAPHS
# Load the LLM code-evaluation results.
file = CSV_FILES / 'Result_Final_code_clean2025-09-18T00:48:40.584077.csv'
code_eval = pd.read_csv(file)

# Drop every row that has at least one missing value.
code_eval = code_eval.dropna(how="any")

# Split `task` on "_" into dataset / metric / "<research problem>-<dem>",
# then split the third part on "-".  `expand=True` builds the columns
# directly instead of constructing one Series per row with
# `.apply(pd.Series)`; the original row index is kept, so the pieces align
# with `code_eval` in the concat below.
task_data_metric = (
    code_eval['task']
    .str.split('_', expand=True)
    .rename(columns={0: 'task_dataset', 1: 'task_metric', 2: 'task-dem'})
)
task_data_dem = (
    task_data_metric['task-dem']
    .str.split('-', expand=True)
    .rename(columns={0: 'resrch_prob', 1: 'dem'})
)
wider_code = pd.concat([code_eval, task_data_metric, task_data_dem], axis=1)
wider_cols = [
    'model', 'task', 'task_dataset', 'task_metric', 'resrch_prob', 'dem',
    'run_ts', 'run_id', 'total_llm_score',
    "1. Data Collection and Processing",
    "2. Bias Detection and Mitigation",
    "3. Fairness Metric Selection",
    "4. Model Selection and Training",
    "5. Evaluation and Testing",
]
wider_code = wider_code[wider_cols]

# Keep only the adult dataset, then only the "di" task metric.
adult = wider_code[wider_code['task_dataset'] == 'adult']
adult_di = adult[adult['task_metric'] == 'di']

# Rubric score columns that become rows of the long-format table.
rubric_columns = [
    '1. Data Collection and Processing',
    '2. Bias Detection and Mitigation',
    '3. Fairness Metric Selection',
    '4. Model Selection and Training',
    '5. Evaluation and Testing',
]
identifier_columns = ['model', 'task_dataset', 'task_metric', 'resrch_prob', 'dem']

# One row per (run, rubric section) with the section name and its score.
long_df = pd.melt(
    adult_di,
    id_vars=identifier_columns,
    value_vars=rubric_columns,
    var_name='rubric_section',
    value_name='score',
)

sns.set_context(context='poster', font_scale=1.0)

# NOTE: the former `plt.figsize = (16, 12)` line was removed. It only set a
# stray attribute on the pyplot module and never affected the figure.
# `catplot` is a figure-level function whose size comes from its
# `height` / `aspect` arguments (per facet).
m = sns.catplot(
    data=long_df,
    x="rubric_section",
    y="score",
    hue="model",
    col="resrch_prob",
    row='dem',
    kind="bar",
    aspect=2,
)
m.set_titles(template='{col_var}: {col_name}\n{row_var}: {row_name}')

for ax in m.axes.flatten():
    # Tilt the rubric-section tick labels.
    plt.setp(ax.get_xticklabels(), rotation=30)
    # Faint horizontal reference line at a score of 4.0 in every facet.
    ax.axhline(y=4.0, color='black', linestyle='-.', alpha=0.3)

output = os.path.join(GRAPHS, 'adult_di_code_llm_eval.png')
plt.savefig(output, dpi=400, bbox_inches='tight')
Loading