Mitigation_EntryPoints_CodeRepo/Coal_Robustness_Checks.py at main · Wegatriespython/Mitigation_EntryPoints_CodeRepo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import numpy as np
import pandas as pd
import os
import sys
from sklearn.model_selection import StratifiedKFold, LeaveOneOut
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from src.data_processing.general_preprocessing import load_and_preprocess
from src.analysis.random_forest import create_feature_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.ensemble import RandomForestClassifier

# File paths and settingsf
#
file_name = "Codebook_Coal_Cleanv2.xlsm"
script_dir = r"C:\Users\vigne\OneDrive - Wageningen University & Research\Internship\Literature Review\Final Data Processing\Mitigation_EntryPoints_CodeRepo\data\raw\Codebook_Coal_Cleanv2.xlsm"

# Go up one level to the parent directory
parent_dir = os.path.dirname(script_dir)
INPUT_FILE = script_dir
OUTPUT_FILE = "coal_cluster_analysis_results.xlsx"
CLUSTER_COLUMN = "Cluster"
ENABLER_COLUMN = "Enabler"
ENTRY_COLUMN = "Entry (policy intervention)"

def run_coal_cluster_analysis():
    # Load and preprocess data
    df, vectorized_data = load_and_preprocess(INPUT_FILE, ENABLER_COLUMN, ENTRY_COLUMN, CLUSTER_COLUMN)

    # Create feature matrix
    feature_matrix, feature_names, enabler_features = create_feature_matrix(df, ENABLER_COLUMN, ENTRY_COLUMN)

    # Encode cluster labels
    le = LabelEncoder()
    y = le.fit_transform(df[CLUSTER_COLUMN])
    cluster_names = le.classes_

    # Initialize results dictionary
    results = {
        'Cluster': [],
        'Accuracy': [],
        'Precision': [],
        'Recall': [],
        'F1 Score': [],
        'ROC AUC': []
    }

    # Run analysis for each cluster
    for i, cluster in enumerate(cluster_names):
        print(f"Analyzing cluster: {cluster}")

        # Create binary target for current cluster
        y_binary = (y == i).astype(int)

        # Determine appropriate sampling method and cross-validation strategy
        min_samples = np.min(np.bincount(y_binary))
        if min_samples == 1:
            sampler = RandomOverSampler(random_state=42)
            cv = LeaveOneOut()
        else:
            k_neighbors = min(5, min_samples - 1)
            sampler = SMOTE(random_state=42, k_neighbors=k_neighbors)
            cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

        X_resampled, y_resampled = sampler.fit_resample(feature_matrix, y_binary)

        # Initialize and train Random Forest
        rf = RandomForestClassifier(n_estimators=50, random_state=42, class_weight='balanced')

        # Perform cross-validation
        cv_accuracy = []
        cv_precision = []
        cv_recall = []
        cv_f1 = []
        cv_roc_auc = []

        for train_index, val_index in cv.split(X_resampled, y_resampled):
            X_train, X_val = X_resampled[train_index], X_resampled[val_index]
            y_train, y_val = y_resampled[train_index], y_resampled[val_index]

            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_val)
            y_pred_proba = rf.predict_proba(X_val)[:, 1]

            cv_accuracy.append(accuracy_score(y_val, y_pred))
            cv_precision.append(precision_score(y_val, y_pred, zero_division=0))
            cv_recall.append(recall_score(y_val, y_pred, zero_division=0))
            cv_f1.append(f1_score(y_val, y_pred, zero_division=0))
            cv_roc_auc.append(roc_auc_score(y_val, y_pred_proba))

        # Store results
        results['Cluster'].append(cluster)
        results['Accuracy'].append(np.mean(cv_accuracy))
        results['Precision'].append(np.mean(cv_precision))
        results['Recall'].append(np.mean(cv_recall))
        results['F1 Score'].append(np.mean(cv_f1))
        results['ROC AUC'].append(np.mean(cv_roc_auc))

        print(f"ROC AUC: {results['ROC AUC'][-1]:.4f}")
        print("-----------------------------")

    # Create DataFrame from results
    results_df = pd.DataFrame(results)

    # Calculate mean scores
    mean_scores = results_df.mean(numeric_only=True)
    mean_scores_df = pd.DataFrame(mean_scores).T
    mean_scores_df['Cluster'] = 'Mean'

    # Combine results and mean scores
    final_results = pd.concat([results_df, mean_scores_df], ignore_index=True)

    # Save results to Excel
    final_results.to_excel(OUTPUT_FILE, index=False, float_format='%.3f')
    print(f"Results saved to {OUTPUT_FILE}")

    # Print results table
    print("\n Coal Cluster Analysis Results:")
    print(final_results.to_string(index=False))

if __name__ == "__main__":
    run_coal_cluster_analysis()