-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathaverageRunsCollectAndLatex.py
More file actions
117 lines (92 loc) · 4.1 KB
/
averageRunsCollectAndLatex.py
File metadata and controls
117 lines (92 loc) · 4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from pathlib import Path
from typing import Tuple, Union
import pandas as pd
def average_time_columns_by_label_to_latex_pretty(
csv_path: Union[str, Path],
label_col: str = "label",
float_precision: int = 3,
output_csv: Union[str, Path, None] = None,
output_tex: Union[str, Path, None] = None,
) -> Tuple[pd.DataFrame, str]:
"""
Read a CSV of multiple runs, average all columns starting with 'time_' grouped by
['database', label] when 'database' exists, otherwise grouped by [label]. Keep
non-time columns using the first value per group. Drop 'run' and any 'Unnamed:*'
columns. Preserve the original column order from the CSV (minus the dropped ones).
Sort rows by database then label (or just label). Return (DataFrame, LaTeX string).
Parameters
----------
csv_path : str | Path
Input CSV path.
label_col : str
The label column name.
float_precision : int
Number of decimal places for numeric display in LaTeX.
output_csv : optional
If provided, write the averaged table to this CSV path.
output_tex : optional
If provided, write the LaTeX table to this path.
Returns
-------
(df_out, latex_str) : (pd.DataFrame, str)
"""
csv_path = Path(csv_path)
df = pd.read_csv(csv_path)
if label_col not in df.columns:
msg = f"Label column '{label_col}' not found. Columns: {list(df.columns)}"
raise KeyError(msg)
# Drop 'run' and any Unnamed:* columns
cols_to_drop = set()
if "run" in df.columns:
cols_to_drop.add("run")
unnamed_mask = df.columns.str.match(r"^Unnamed", na=False)
cols_to_drop.update(df.columns[unnamed_mask].tolist())
df = df.drop(columns=list(cols_to_drop), errors="ignore")
# Preserve original column order *after* drops
original_cols = df.columns.tolist()
# Choose grouping keys
group_keys = ["database", label_col] if "database" in df.columns else [label_col]
# Identify time_* columns (preserve their order as in the CSV)
time_cols = [c for c in original_cols if c.startswith("time_")]
# Identify non-time columns excluding group keys
other_cols = [c for c in original_cols if c not in time_cols and c not in group_keys]
# Group and aggregate
g = df.groupby(group_keys, dropna=False)
# Mean for time_* columns; 'first' for non-time columns
avg_time = g[time_cols].mean(numeric_only=True) if len(time_cols) else pd.DataFrame(index=g.size().index)
keep_other = g[other_cols].first() if len(other_cols) else pd.DataFrame(index=g.size().index)
out = keep_other.join(avg_time)
# Bring group keys back as columns
out = out.reset_index()
# Reorder columns exactly as in the CSV (after drops)
final_cols = [c for c in original_cols if c in out.columns]
out = out[final_cols]
# Sort rows by database then label (or just label)
sort_keys = [k for k in ["database", label_col] if k in out.columns]
if sort_keys:
out = out.sort_values(by=sort_keys, kind="stable")
# ===== Prettify LaTeX =====
# Build a column_format string: right-align numeric columns, left-align others
num_cols = set(out.select_dtypes(include="number").columns.tolist())
colfmt = "".join("r" if c in num_cols else "l" for c in out.columns)
# Use Styler for better LaTeX with hrules and alignment
styler = out.style.hide(axis="index").format(precision=float_precision)
latex_str = styler.to_latex(hrules=True, column_format=colfmt)
# Optionally write files
if output_csv:
Path(output_csv).write_text(out.to_csv(index=False), encoding="utf-8")
if output_tex:
Path(output_tex).write_text(latex_str, encoding="utf-8")
return out, latex_str
# ===== Example usage =====
if __name__ == "__main__":
    # Average one benchmark report by label and export both a CSV and a
    # LaTeX version of the resulting table into the working directory.
    example_kwargs = dict(
        csv_path="reports/results_21-10-25:16:51:24.csv",
        label_col="label",
        float_precision=2,
        output_csv="averaged_time_by_label.csv",
        output_tex="averaged_time_by_label.tex",
    )
    df_out, latex = average_time_columns_by_label_to_latex_pretty(**example_kwargs)

    # Quick sanity check of the result on stdout.
    print(df_out.head())
    print("\n===== LaTeX Preview =====\n")
    print(latex)