-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsetUpSacramentoModelFiles.py
More file actions
93 lines (70 loc) · 4.13 KB
/
setUpSacramentoModelFiles.py
File metadata and controls
93 lines (70 loc) · 4.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import pandas
import thesisFunctions
# Assemble the training dataset covering every month and every region
sacBasePath = 'SacramentoModel/'
rfDataPath = '../RF_model_data/data/model_training/'
regions = ['IntMnt', 'Xeric', 'CoastMnt']
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
# Load the Sacramento basin gage list and keep only reference-class station IDs (as strings)
sacGagesFile = sacBasePath + 'Sac_gages_list.csv'
sacGagesDF = pandas.read_csv(sacGagesFile)
sacRefGages = [str(staid) for staid in sacGagesDF.loc[sacGagesDF.CLASS == 'Ref', 'STAID']]
# Accumulator DataFrame for training observations from all months/regions
allTrainingDF = pandas.DataFrame()
# For each month: gather training rows from every region into allTrainingDF,
# and write that month's prediction data to its own csv (too big for one file).
# NOTE: uses pandas.concat for accumulation — DataFrame.append was deprecated
# in pandas 1.4 and removed entirely in pandas 2.0, so the old .append() calls
# break on any modern pandas install.
for month in months:
    print()
    print('Processing month:', month.title())
    # Build DataFrame in which to accumulate prediction data
    monthPredictionDF = pandas.DataFrame()
    for region in regions:
        print(region, end=' ', flush=True)
        # Get training data from matching directory used to create the original random forest model, writing "ObsID" as
        # header for first column and deleting the 'X.1' mystery variable when it shows up
        sourceFile = rfDataPath + region.lower() + '/' + month + '_' + region + '_ref.csv'
        regionTrainingDF = pandas.read_csv(sourceFile)
        regionTrainingDF.rename(columns={'Unnamed: 0': 'ObsID'}, inplace=True)
        badVariable = 'X.1'
        if badVariable in regionTrainingDF.columns.values:
            regionTrainingDF.drop(badVariable, axis=1, inplace=True)
        # Drop "T" from beginning of STAID column (someone added it as a string casting hack before I got the data)
        regionTrainingDF['STAID'] = regionTrainingDF['STAID'].map(lambda x: x[1:])
        # Subset all training data to just those observations in the Sacramento basin
        regionTrainingDF = regionTrainingDF[regionTrainingDF.STAID.isin(sacRefGages)]
        # Get prediction data
        regionPredictionDF = thesisFunctions.prepSacramentoData(month, region)
        # Hack: If we're in the CoastMnt region loop, reassign region as 'IntMnt' because the only Sacramento reference
        # gage in that region's source file, 11371000, is actually in the West Mnt region in the Gages II database. None
        # of the prediction data is in the CoastMnt region so it won't affect anything in that dataframe.
        if region == 'CoastMnt':
            regionForDF = 'IntMnt'
        else:
            regionForDF = region
        # Add columns that ID current region and month (1 if True, 0 if False) to each DataFrame, leaving off last one
        # in each list to prevent model being over-specified (actually the last two for the regions, because of the
        # 11371000 misclassification as CoastMnt explained above)
        for regionColumn in regions[:-2]:
            if regionColumn == regionForDF:
                regionTrainingDF[regionColumn] = 1
                regionPredictionDF[regionColumn] = 1
            else:
                regionTrainingDF[regionColumn] = 0
                regionPredictionDF[regionColumn] = 0
        for monthColumn in months[:-1]:
            if monthColumn == month:
                regionTrainingDF[monthColumn] = 1
                regionPredictionDF[monthColumn] = 1
            else:
                regionTrainingDF[monthColumn] = 0
                regionPredictionDF[monthColumn] = 0
        # Append data to accumulator DataFrames (concat replaces the removed DataFrame.append)
        allTrainingDF = pandas.concat([allTrainingDF, regionTrainingDF], ignore_index=True)
        # CoastMnt's DataFrame being empty means it has a different column order (HUC12 in the middle rather than at the
        # end), so don't append it, because the different column orders mess up the output.
        if region != 'CoastMnt':
            monthPredictionDF = pandas.concat([monthPredictionDF, regionPredictionDF], ignore_index=True)
    # Write month's prediction data out to file (too big to deal with all months in one csv)
    # (sacBasePath already ends with '/', so don't prepend another one)
    predictionFilePath = sacBasePath + 'Prediction/sacramentoData_' + month + '.csv'
    monthPredictionDF.to_csv(predictionFilePath, index=False)
# Dump the accumulated Sacramento training data into a single csv file
allTrainingDF.to_csv(sacBasePath + 'Sacramento_Basin.csv', index=False)