-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathCoal_Robustness_Checks.py
More file actions
122 lines (99 loc) · 4.49 KB
/
Coal_Robustness_Checks.py
File metadata and controls
122 lines (99 loc) · 4.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import numpy as np
import pandas as pd
import os
import sys
from sklearn.model_selection import StratifiedKFold, LeaveOneOut
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from src.data_processing.general_preprocessing import load_and_preprocess
from src.analysis.random_forest import create_feature_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.ensemble import RandomForestClassifier
# File paths and settings
#
# Name of the input workbook (not referenced again in the visible part of this file).
file_name = "Codebook_Coal_Cleanv2.xlsm"
# NOTE(review): despite its name, `script_dir` holds the absolute path of the
# input workbook itself, not a directory. The path is specific to one machine;
# consider making it configurable.
script_dir = r"C:\Users\vigne\OneDrive - Wageningen University & Research\Internship\Literature Review\Final Data Processing\Mitigation_EntryPoints_CodeRepo\data\raw\Codebook_Coal_Cleanv2.xlsm"
# Directory that contains the workbook (dirname of the file path above).
# Not referenced again in the visible part of this file.
parent_dir = os.path.dirname(script_dir)
# Path handed to load_and_preprocess() by run_coal_cluster_analysis().
INPUT_FILE = script_dir
# Results workbook; a relative path, so it is written to the current working directory.
OUTPUT_FILE = "coal_cluster_analysis_results.xlsx"
# Column names passed to the preprocessing / feature-matrix helpers.
# CLUSTER_COLUMN is also the column whose values are used as class labels.
CLUSTER_COLUMN = "Cluster"
ENABLER_COLUMN = "Enabler"
ENTRY_COLUMN = "Entry (policy intervention)"
def _resample_training_fold(X_train, y_train, random_state=42):
    """Balance the classes of a single training fold by oversampling.

    Resampling is done on the training portion only, so that no duplicated or
    synthetic minority sample can end up in the matching validation fold.

    Args:
        X_train: Feature rows of the training fold.
        y_train: Binary (0/1) integer labels of the training fold.
        random_state: Seed forwarded to the sampler.

    Returns:
        A ``(X, y)`` pair; the inputs unchanged when only one class is present.
    """
    minority_count = int(np.bincount(y_train, minlength=2).min())
    if minority_count == 0:
        # Only one class in this fold: there is nothing to balance, and the
        # imblearn samplers do not accept a single-class target.
        return X_train, y_train
    if minority_count == 1:
        # SMOTE interpolates between minority neighbours, so it needs at
        # least two minority samples; fall back to plain duplication.
        sampler = RandomOverSampler(random_state=random_state)
    else:
        sampler = SMOTE(
            random_state=random_state,
            k_neighbors=min(5, minority_count - 1),
        )
    return sampler.fit_resample(X_train, y_train)


def _positive_class_scores(model, X_val, n_val):
    """Return P(class == 1) for each validation row.

    If the model was fitted on a fold that contained no positive sample,
    ``predict_proba`` has no column for class 1; the probability is then 0.
    """
    classes = list(model.classes_)
    if 1 not in classes:
        return np.zeros(n_val)
    return model.predict_proba(X_val)[:, classes.index(1)]


def _score_predictions(y_true, y_pred, y_score):
    """Compute the reported metrics for one set of predictions.

    ROC AUC is undefined when ``y_true`` holds a single class; NaN is returned
    in that case so that it is ignored when fold scores are averaged.
    """
    has_both_classes = np.unique(y_true).size == 2
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'F1 Score': f1_score(y_true, y_pred, zero_division=0),
        'ROC AUC': roc_auc_score(y_true, y_score) if has_both_classes else np.nan,
    }


def run_coal_cluster_analysis():
    """Evaluate one-vs-rest Random Forest classifiers for every cluster.

    For each distinct value of ``CLUSTER_COLUMN`` a binary target
    (this cluster vs. all others) is built and a Random Forest is
    cross-validated on the feature matrix. Per-cluster accuracy, precision,
    recall, F1 and ROC AUC, plus a final "Mean" row, are written to
    ``OUTPUT_FILE`` and printed.

    Evaluation protocol:
        * Oversampling is applied inside each fold, to the training portion
          only. Resampling before the split would leak copies / synthetic
          neighbours of validation samples into training and inflate scores.
        * When the smaller class has at least as many members as there are
          folds, a shuffled 3-fold stratified split is used and fold scores
          are averaged.
        * Otherwise leave-one-out is used. Single-sample folds cannot yield
          meaningful precision/recall/ROC AUC, so the out-of-fold predictions
          are pooled and scored once.
        * A cluster that covers every row (no negative examples) cannot be
          evaluated; its metrics are reported as NaN.

    Returns:
        None. Results are saved to ``OUTPUT_FILE`` and printed to stdout.
    """
    n_splits = 3
    random_state = 42
    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

    # Load and preprocess data (the second return value is not needed here).
    df, _ = load_and_preprocess(INPUT_FILE, ENABLER_COLUMN, ENTRY_COLUMN, CLUSTER_COLUMN)

    # Create feature matrix (feature names / enabler features are not needed here).
    feature_matrix, _, _ = create_feature_matrix(df, ENABLER_COLUMN, ENTRY_COLUMN)

    # Encode cluster labels as integers 0..n_clusters-1.
    le = LabelEncoder()
    y = le.fit_transform(df[CLUSTER_COLUMN])

    rows = []
    for i, cluster in enumerate(le.classes_):
        print(f"Analyzing cluster: {cluster}")

        # Binary target: 1 for the current cluster, 0 for every other one.
        y_binary = (y == i).astype(int)
        minority_count = int(np.bincount(y_binary, minlength=2).min())

        if minority_count == 0:
            # Single-class target: no classifier can be validated.
            print("Skipped: only one class present, metrics are undefined")
            print("-----------------------------")
            rows.append({'Cluster': cluster, **dict.fromkeys(metric_names, np.nan)})
            continue

        # Too few minority samples to place one in every stratified fold:
        # fall back to leave-one-out with pooled scoring.
        use_leave_one_out = minority_count < n_splits
        if use_leave_one_out:
            cv = LeaveOneOut()
        else:
            cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

        rf = RandomForestClassifier(
            n_estimators=50,
            random_state=random_state,
            class_weight='balanced',
        )

        # One (y_true, y_pred, y_score) triple per validation fold.
        fold_outputs = []
        for train_index, val_index in cv.split(feature_matrix, y_binary):
            X_train, y_train = _resample_training_fold(
                feature_matrix[train_index],
                y_binary[train_index],
                random_state=random_state,
            )
            X_val, y_val = feature_matrix[val_index], y_binary[val_index]

            rf.fit(X_train, y_train)
            fold_outputs.append((
                y_val,
                rf.predict(X_val),
                _positive_class_scores(rf, X_val, len(val_index)),
            ))

        if use_leave_one_out:
            # Merge the single-sample folds into one prediction set.
            fold_outputs = [tuple(np.concatenate(parts) for parts in zip(*fold_outputs))]

        fold_scores = pd.DataFrame(
            [_score_predictions(*fold) for fold in fold_outputs],
            columns=metric_names,
        )
        # DataFrame.mean skips NaN, so folds with undefined ROC AUC are ignored.
        row = {'Cluster': cluster, **fold_scores.mean().to_dict()}
        rows.append(row)

        print(f"ROC AUC: {row['ROC AUC']:.4f}")
        print("-----------------------------")

    # Create DataFrame from results
    results_df = pd.DataFrame(rows, columns=['Cluster', *metric_names])

    # Mean of each metric across clusters, appended as a final "Mean" row.
    mean_scores_df = pd.DataFrame(results_df[metric_names].mean()).T
    mean_scores_df['Cluster'] = 'Mean'
    final_results = pd.concat([results_df, mean_scores_df], ignore_index=True)

    # Save results to Excel
    final_results.to_excel(OUTPUT_FILE, index=False, float_format='%.3f')
    print(f"Results saved to {OUTPUT_FILE}")

    # Print results table
    print("\n Coal Cluster Analysis Results:")
    print(final_results.to_string(index=False))
# Script entry point: run the full per-cluster evaluation only when this file
# is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    run_coal_cluster_analysis()