Applied-Data-Science-1/ADS1_Assignment3.py at main · uresha1995/Applied-Data-Science-1 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 11 21:44:41 2024

@author: uresha

"""
from scipy.optimize import curve_fit
import scipy.optimize as opt
import seaborn as sns
import sklearn.metrics as skmet
import sklearn.preprocessing as pp
from sklearn import cluster
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=UserWarning,
                        message="KMeans is known to have a memory leak on Windows with MKL")

# Read file


def data():
    """
    Read and return the data File
    """
    co2_cap = pd.read_csv(
        r'C:\Users\uresha\Dropbox\PC\Desktop\UH\2. Applied Data Science 1\Assignment 3\fin\CO2 per $ of GDP.csv')
    return co2_cap


co2_cap = data()
print(co2_cap)

# From 1990 to 2020, data use for clustering
# Countries with one NaN drop from data set
co2_cap = co2_cap[(co2_cap["1990"].notna()) & (co2_cap["2020"].notna())]
co2_cap = co2_cap.reset_index(drop=True)

# Check types
print(co2_cap.dtypes)

# Extract the year 1990
increment = co2_cap[["Country Name", "1990"]].copy()

# Calculate the increment of co2 vs gdp over 30 years
increment["Growth"] = 100.0 / 30.0 * \
    (co2_cap["2020"] - co2_cap["1990"]) / co2_cap["1990"]
print(increment.describe())
print()

# Identify the data type
print(increment.dtypes)

# Plot the graph
plt.figure(figsize=(10, 10))
plt.scatter(increment["1990"], increment["Growth"])

plt.xlabel("CO2 emission per unit of GDP 1990")
plt.ylabel("GDP growth per year %")
plt.title("CO2 emission vs GDP Growth per year")
plt.show()

# Creat a scaler object
scaler = pp.RobustScaler()

# Set up the scaler object and extract the columns for clustering
df_ex = increment[["1990", "Growth"]].copy()

# Check and replace any infinite or extremely large values
df_ex.replace([np.inf, -np.inf], np.nan, inplace=True)
df_ex.fillna(df_ex.max(), inplace=True)

scaler.fit(df_ex)

# Apply the scaling
norm = scaler.transform(df_ex)

# Plot the graph
plt.figure(figsize=(10, 10))
plt.scatter(norm[:, 0], norm[:, 1])

plt.xlabel("CO2 emission per unit of GDP 1990")
plt.ylabel("GDP growth per year %")
plt.title("CO2 emission vs GDP Growth per year_scaler")
plt.show()


def silhoutte_value(xy, n):
    """
    This function is use to calculate the silhoutte score for n clusters
    """

    # set up the clusters with the expected clusters
    kmeans = cluster.KMeans(n_clusters=n, n_init=20)

    # fit the data, result are sorted in kmeans object
    kmeans.fit(xy)

    labels = kmeans.labels_

    # calculate the silhoutte score
    score = (skmet.silhouette_score(xy, labels))

    return score


# Calculate silhouette score for 3 to 10 clusters
for ic in range(2, 11):
    score = silhoutte_value(norm, ic)
    print(f"The silhouette score for {ic: 3d} is {score: 7.4f}")

# Set up the clusters with the number of expected clusters
kmeans = cluster.KMeans(n_clusters=3, n_init=20)

# Fit the data and stored results in the kmeans object
kmeans.fit(norm)

# Extract the cluster labels
labels = kmeans.labels_
# Add cluster labels to the original DataFrame
co2_cap['Cluster'] = labels

# Extract the estimated cluster centres and convert to original scales
cen = kmeans.cluster_centers_
cen = scaler.inverse_transform(cen)
xkmeans = cen[:, 0]
ykmeans = cen[:, 1]

print(df_ex)

# Print the countries in each cluster
for cluster_num in range(kmeans.n_clusters):
    cluster_countries = co2_cap[co2_cap['Cluster']
                                == cluster_num]['Country Name'].tolist()
    print(f"\nCluster {cluster_num + 1} Countries:")
    print(", ".join(cluster_countries))

plt.figure(figsize=(8, 8))
cm = matplotlib.colormaps["Paired"]

# Plot data with kmeans cluster number
plt.scatter(increment["1990"], increment["Growth"],
            s=30, c=labels, cmap=cm, marker="o")

# Show cluster centres with different marker and larger size
plt.scatter(xkmeans, ykmeans, s=50, c="k", marker="D", label="Cluster Centers")

plt.xlabel("CO2 emission per head 1990", fontsize=18)
plt.ylabel("GDP increment per year [%]", fontsize=18)
plt.legend(fontsize=18)
plt.tick_params(axis='x', labelsize=15)
plt.tick_params(axis='y', labelsize=15)
plt.show()

# DATA FITTING


def read_data():
    """
    Read and return the data File
    """
    original_dataset = pd.read_csv(
        r'C:\Users\uresha\Dropbox\PC\Desktop\UH\2. Applied Data Science 1\Assignment 3\fin\GDP per Capita.csv')
    return original_dataset


dataset = read_data()
print(dataset)

# Read and transpose the data


def transpose_and_clean_dataset(csv_file):
    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Preparing dataset
    columns_to_remove = ['Indicator Code', 'Country Code']
    df = df.drop(columns=columns_to_remove)

    # Transpose the years to a 'Year' column
    df = df.melt(id_vars=['Indicator Name', 'Country Name'],
                 var_name='Year', value_name='Value')

    # Set 'Year' as the new index
    df = df.set_index('Year')

    return df


# Actual CSV file
result_df = transpose_and_clean_dataset(
    r'C:\Users\uresha\Dropbox\PC\Desktop\UH\2. Applied Data Science 1\Assignment 3\fin\GDP per Capita.csv')

# Print DataFrame
print(result_df)

# Save the transposed DataFrame to a new CSV file
result_df.to_csv(
    r'C:\Users\uresha\Dropbox\PC\Desktop\UH\2. Applied Data Science 1\Assignment 3\fin\transposed_dataset.csv')

# Explore the statistical properties
countries = dataset['Country Name'].unique()

# Convert the 'Year' index to a column
result_df.reset_index(inplace=True)

# Convert the 'Year' column to a numeric type
result_df['Year'] = pd.to_numeric(result_df['Year'].str.extract(r'(\d+)')[0])

# Plotting the data
plt.figure(figsize=(10, 8))
sns.lineplot(x='Year', y='Value', data=result_df, ci=None)
plt.title('GDP per Capita Over the Past Years')
plt.xlabel('Year')
plt.ylabel('GDP per Capita ($)')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

info = result_df.iloc[1:20]
sns.lineplot(x='Year', y='Value', data=result_df, ci=None)
plt.show()

# Exponential function


def exp(t, n0, g):
    """
    Calculates exponential function with scale factor n0 and growth rate g.
    """

    t = t - 1990
    f = n0 * np.exp(g * t)
    return f

# Check for NaN and inf values in the dataset
    """
    Since array must not contain infs or NaNs. Nan and inf values replace by 0
    """


nan_mask = np.isnan(result_df["Value"])
inf_mask = np.isinf(result_df["Value"])

# Replace NaN and inf values
result_df["Value"][nan_mask] = 0
result_df["Value"][inf_mask] = 0

# Parameter estimation using curve_fit
param, covar = opt.curve_fit(
    exp, result_df["Year"], result_df["Value"], p0=[1.2e12, 0.03])

# Print the results
print("GDP 1990:", param[0] / 1e9)
print("Growth rate:", param[1])

"""
    Since this is not a good way. Since the year-by-year data are highly correlated.
    Let curve_fit find the errors.
    """

# Plotting the exponential function along with the data
plt.figure(figsize=(10, 8))

# Plot the original data
sns.lineplot(x='Year', y='Value', data=result_df, ci=None)

# Plot the fitted exponential function
plt.plot(result_df["Year"], exp(result_df["Year"], *param),
         label='Trail data fit', linestyle='--')

plt.title('GDP per Capita Over the Past Years')
plt.xlabel('Year')
plt.ylabel('GDP per Capita ($)')
plt.legend(loc='upper left', bbox_to_anchor=(0, 0, 1, 1))
plt.show()

# Using curve_fit for the logistic function


def logistic(t, n0, g, t0):
    """
    Calculates the logistic function with scale factor n0, growth rate g,
    and inflection point t0.
    """
    f = n0 / (1 + np.exp(-g*(t - t0)))
    return f


def plot_gdp_with_fits(result_df):
    # Check for NaN and inf values in the dataset
    nan_mask = np.isnan(result_df["Value"])
    inf_mask = np.isinf(result_df["Value"])

    # Replace NaN and inf values
    result_df["Value"][nan_mask] = 0
    result_df["Value"][inf_mask] = 0

    # Parameter estimation using curve_fit for the exponential function
    param_exp, covar_exp = curve_fit(
        exp, result_df["Year"], result_df["Value"], p0=[1.2e12, 0.03])

    # Parameter estimation using curve_fit for the logistic function
    param_logistic, covar_logistic = curve_fit(
        logistic, result_df["Year"], result_df["Value"], p0=[3e12, param_exp[1], 1990])

    # Print the exponential and logistic function parameter results
    print("Logistic Initial GDP:", param_logistic[0] / 1e9)
    print("Logistic Growth rate:", param_logistic[1])
    print("Logistic Inflection point:", param_logistic[2])

    plt.figure(figsize=(15, 10))

    # Plot the original data
    sns.lineplot(x='Year', y='Value', data=result_df, ci=None, label='GDP')

    # Plot the fitted logistic function
    plt.plot(result_df["Year"], logistic(result_df["Year"], *param_logistic),
             label='Logistic fit', linestyle='-')

    plt.title('GDP per Capita Over the Past Years')
    plt.xlabel('Year')
    plt.ylabel('GDP per Capita ($)')
    plt.legend(loc='upper left', bbox_to_anchor=(0, 0, 1, 1))
    plt.show()


# Plot graph
plot_gdp_with_fits(result_df)


def logistic_offset(t, n0, g, t0, offset):
    """Calculates the logistic function with an offset."""
    f = offset + n0 / (1 + np.exp(-g * (t - t0)))
    return f


# Adding an offset parameter
param, covar = curve_fit(
    logistic_offset, result_df["Year"], result_df["Value"], p0=[1e12, 0.5, 1990, 5000])

# Plot the original data
sns.lineplot(x='Year', y='Value', data=result_df, ci=None, label='GDP')

# Plot the fitted logistic curve with the adjusted starting point
plt.plot(result_df["Year"], logistic_offset(result_df["Year"],
         *param), label='Logistic Fit', linestyle='--', color='red')

# Plot graph
plt.title('GDP per Capita Over Past Years with Logistic Fit')
plt.xlabel('Year')
plt.ylabel('GDP per Capita Growth')
plt.legend(loc='upper left', bbox_to_anchor=(0, 0, 1, 1))
plt.show()

# Extract variances and take square root to get sigmas
var = np.diag(covar)
sigma = np.sqrt(var)
print(f"Turning point {param[2]: 6.1f} +/- {sigma[2]: 4.1f}")
print(f"GDP at turning point {param[0]: 7.3e} +/- {sigma[0]: 7.3e}")
print(f"Growth rate {param[1]: 6.4f} +/- {sigma[1]: 6.4f}")


forecast_years = np.linspace(1990, 2030, 100)

# Use the logistic function with the fitted parameters to generate the forecast
forecast_values = logistic_offset(forecast_years, *param)

# Plot graph
plt.figure(figsize=(15, 10))
sns.lineplot(x=result_df["Year"], y=result_df["Value"], label="GDP",  ci=None)
sns.lineplot(x=forecast_years, y=forecast_values, label="Forecast")
plt.xlabel("Year")
plt.ylabel("GDP per Capita Growth")
plt.legend()
plt.title('GDP per Capita Forecast')
plt.show()


# Create an array and forecast for next 10 year
def deriv(x, func, parameter, i, h=1e-5):
    """
    Calculate the numerical derivative of a function with respect to the
    i-th parameter.
    """
    params_plus_h = parameter.copy()
    params_plus_h[i] += h
    deriv = (func(x, *params_plus_h) - func(x, *parameter)) / h
    return deriv


def error_prop(x, func, parameter, covar):
    """
    Calculates 1 sigma error ranges for a number or array using error
    propagation with variances and covariances taken from the covar matrix.
    Derivatives are calculated numerically.
    """
    var = np.zeros_like(x)   # initialize variance vector

    # Nested loop over all combinations of the parameters
    for i in range(len(parameter)):
        # derivative with respect to the ith parameter
        deriv1 = deriv(x, func, parameter, i)

        for j in range(len(parameter)):
            # derivative with respect to the jth parameter
            deriv2 = deriv(x, func, parameter, j)

            # covariance matrix element
            covar_ij = covar[i, j]

            # add to the variance vector
            var += deriv1 * covar_ij * deriv2

    sigma = np.sqrt(var)
    return sigma


year = np.linspace(1990, 2030, 100)
forecast = logistic_offset(year, *param)

# Using error propagation
sigma = error_prop(year, logistic_offset, param, covar)
up = forecast + sigma
low = forecast - sigma

# Forecast plot for all countries
plt.figure(figsize=(15, 10))
sns.lineplot(x=result_df["Year"], y=result_df["Value"], label="GDP", ci=None)
sns.lineplot(x=year, y=forecast, label="Forecast", ci=None)
plt.fill_between(year, low, up, color="yellow", alpha=0.7)
plt.xlabel("Year", fontsize=18)
plt.ylabel("GDP per Capita Growth", fontsize=18)
plt.legend(loc='upper left', bbox_to_anchor=(0, 0, 1, 1), fontsize=18)
plt.title('GDP per Capita Growth Forecast - Global Level ', fontsize=18)
plt.tick_params(axis='x', labelsize=15)
plt.tick_params(axis='y', labelsize=15)
plt.show()

# Forecast plot for Randomly selected countries
"""Countries were selected randomly to represent the each cluster. For selected
        three countries forcast produce. Nepal selected from cluster 1, Norway from
        cluster 2 and United States from cluster 3"""


# Forecast for Nepal - Cluster 1
nepal_data = result_df[result_df['Country Name'] == 'Nepal']

# Parameter estimation using curve_fit for the exponential function
param_exp, covar_exp = curve_fit(
    exp, nepal_data["Year"], nepal_data["Value"], p0=[1.2e12, 0.03])

# Parameter estimation using curve_fit for the logistic function
param_logistic, covar_logistic = curve_fit(
    logistic, nepal_data["Year"], nepal_data["Value"], p0=[3e12, param_exp[1], 1990])

# Forecasting up to 2030
forecast_years = np.arange(nepal_data["Year"].min(), 2031, 1)

# Plot the original data for Nepal
plt.figure(figsize=(15, 10))
sns.lineplot(x='Year', y='Value', data=nepal_data, ci=None,
             label='GDP', color='green', linewidth=2)

# Plot the fitted logistic function for Nepal
plt.plot(nepal_data["Year"], logistic(
    nepal_data["Year"], *param_logistic), label='Logistic fit', linestyle='-', color='red', linewidth=2)

# Plot the forecasted values
plt.plot(forecast_years, logistic(forecast_years, *param_logistic),
         label='Forecast', linestyle='--')

plt.title('GDP per Capita Forcast (Cluster 1) - Nepal', fontsize=18)
plt.xlabel('Year', fontsize=18)
plt.ylabel('GDP per Capita ($)', fontsize=18)
plt.legend(loc='upper left', bbox_to_anchor=(0, 0, 1, 1), fontsize=18)
plt.tick_params(axis='x', labelsize=15)
plt.tick_params(axis='y', labelsize=15)
plt.show()


# Forecast for Norway - Cluster 2
norway_data = result_df[result_df['Country Name'] == 'Norway']

# Parameter estimation using curve_fit for the exponential function
param_exp, covar_exp = curve_fit(
    exp, norway_data["Year"], norway_data["Value"], p0=[1.2e12, 0.03])

# Parameter estimation using curve_fit for the logistic function
param_logistic, covar_logistic = curve_fit(
    logistic, norway_data["Year"], norway_data["Value"], p0=[3e12, param_exp[1], 1990])

# Forecasting up to 2030
forecast_years = np.arange(norway_data["Year"].min(), 2031, 1)

# Plot the original data for New Zealand
plt.figure(figsize=(15, 10))
sns.lineplot(x='Year', y='Value', data=norway_data, ci=None,
             label='GDP', color='green', linewidth=2)

# Plot the fitted logistic function for New Zealand
plt.plot(norway_data["Year"], logistic(
    norway_data["Year"], *param_logistic), label='Logistic fit', linestyle='-', color='red', linewidth=2)

# Plot the forecasted values
plt.plot(forecast_years, logistic(forecast_years, *param_logistic),
         label='Forecast', linestyle='--')

plt.title('GDP per Capita Forcast (Cluster 2) - Norway', fontsize=18)
plt.xlabel('Year', fontsize=18)
plt.ylabel('GDP per Capita ($)', fontsize=18)
plt.legend(loc='upper left', bbox_to_anchor=(0, 0, 1, 1), fontsize=18)
plt.tick_params(axis='x', labelsize=15)
plt.tick_params(axis='y', labelsize=15)
plt.show()


# Forecast for China - Cluster 3
china_data = result_df[result_df['Country Name'] == 'China']

# Parameter estimation using curve_fit for the exponential function
param_exp, covar_exp = curve_fit(
    exp, china_data["Year"], china_data["Value"], p0=[1.2e12, 0.03])

# Parameter estimation using curve_fit for the logistic function
param_logistic, covar_logistic = curve_fit(
    logistic, china_data["Year"], china_data["Value"], p0=[3e12, param_exp[1], 1990])

# Forecasting up to 2030
forecast_years = np.arange(china_data["Year"].min(), 2031, 1)

# Plot the original data for United States
plt.figure(figsize=(15, 10))
sns.lineplot(x='Year', y='Value', data=china_data, ci=None,
             label='GDP', color='green', linewidth=2)

# Plot the fitted logistic function for United States
plt.plot(china_data["Year"], logistic(
    china_data["Year"], *param_logistic), label='Logistic fit', linestyle='-', color='red', linewidth=2)

# Plot the forecasted values
plt.plot(forecast_years, logistic(forecast_years, *param_logistic),
         label='Forecast', linestyle='--')

plt.title('GDP per Capita Forcast (Cluster 3) - China', fontsize=18)
plt.xlabel('Year', fontsize=18)
plt.ylabel('GDP per Capita ($)', fontsize=18)
plt.legend(loc='upper left', bbox_to_anchor=(0, 0, 1, 1), fontsize=18)
plt.tick_params(axis='x', labelsize=15)
plt.tick_params(axis='y', labelsize=15)
plt.show()