-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
66 lines (54 loc) · 1.96 KB
/
preprocess.py
File metadata and controls
66 lines (54 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from datetime import datetime
import numpy as np
import pandas as pd
# from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from holidays.utils import country_holidays
import helpers
## create a calendar of the holidays. This date range won't be the
## final date range of the DataFrame, I just want to be sure to
## include all the dates.
# date_range = pd.date_range(start='2019-01-01', end='2024-12-01')
# cal = calendar()
# holidays = cal.holidays(start=date_range.min(), end=date_range.max())
def preprocess_data(weather_dat,nrg_dat,savefile=''):
holidays = country_holidays("US", years=np.arange(2019, 2024, 1))
time_deltas = [1, 2, 19, 20, 21, 22, 23, 24, 25, 26, 49, 168] ## in hours
columns = [
"DA_price",
"RT_price",
"load",
"temp",
"dwpt",
"nat_gas_spot_price",
"monthly_avg_NY_natgas_price",
]
data_frame = helpers.MainData(
read_from_file=False,
include_natgas=True,
weather_data_loc=weather_dat,
nrg_data_loc=nrg_dat,
holiday_calendar=holidays,
time_deltas=time_deltas,
columns=columns,
)
data_frame.process_data()
data_frame.add_previous_day_price_load_averages(day_ahead=False)
df = data_frame.data
df = df[df.time.between(datetime(2010, 1, 1), datetime(2023, 8, 31, 23, 59, 59))]
columns_ = df.columns.to_list()
columns_.remove("time")
for v in columns:
columns_.remove(v)
columns = ["time", *columns, *columns_]
df = df[columns]
index_of_first_non_null = df[~df.avg_actual_load_prev_day.isna()].index[0]
df = df[index_of_first_non_null:]
df.interpolate(method="linear", limit_direction="forward", inplace=True, axis=0)
print(f"There are {df.isnull().values.sum()} missing values or NaNs in the DataFrame.")
if savefile:
df.to_csv(
'%s.csv' % savefile,
float_format="%0.5f",
index=False,
)
return df