eia_scrape/database_interface.py at master · RAEL-Berkeley/eia_scrape · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Copyright 2017. All rights reserved. See AUTHORS.txt
# Licensed under the Apache License, Version 2.0 which is in LICENSE.txt
# Modified 2020 by Julia Szinai
"""
Defines several functions to finish processing EIA data and upload to the
Switch-WECC database. Some functions may be used for other purposes.

"""

import os, sys
import pandas as pd
import numpy as np
import getpass

import matplotlib.pyplot as plt
plt.switch_backend('agg')

from IPython import embed


from utils import connect_to_db_and_run_query, append_historic_output_to_csv, connect_to_db_and_push_df

coal_codes = ['ANT','BIT','LIG','SGC','SUB','WC','RC']
# EIA energy source codes that this module collapses into a single 'COAL' fuel:
# ANT = Anthracite Coal, BIT = Bituminous Coal, LIG = Lignite Coal, SGC =
# Coal-Derived Synthesis Gas, SUB = Subbituminous Coal, WC = Waste/Other Coal,
# RC = Refined Coal
# NOTE(review): RC was previously glossed as "Recirculating cooling", which is a
# cooling-system code rather than an energy source -- confirm against the EIA
# 860/923 energy source code table.

# Directory (relative to the working directory) where processed .tab files are written.
outputs_directory = 'processed_data'
# Disable false positive warnings from pandas
pd.options.mode.chained_assignment = None

# generation_plant_scenario_id and generation_plant_existing_and_planned_scenario_id:
# 19 is the scenario with individual (disaggregated) generation plants and
# 20 is the aggregated version of the same scenario.
new_disaggregated_gen_scenario_id = 19.0
new_aggregated_gen_scenario_id = 20.0

# New hydro_simple_scenario_id values (the old hydro simple scenario ids were 2 or 3).
new_disaggregated_hydro_simple_scenario_id = 19.0
new_aggregated_hydro_simple_scenario_id = 20.0

# New generation_plant_cost_id values (the old generation_plant_cost_id values were 2 or 3).
# NOTE(review): "disggregated" is misspelled in the name below; it is left as is
# because other code may reference it.
new_disggregated_generation_plant_cost_id = 19.0
new_aggregated_generation_plant_cost_id = 20.0

# When testing, run the script against backup tables first. Backup tables are
# identified by a PREFIX on the table name; with an empty PREFIX the queries
# hit the main tables.
TESTING_ON_BACKUP_TABLES = False

if TESTING_ON_BACKUP_TABLES:
    PREFIX = 'jsz_backup_'
else:
    PREFIX = ''

def pull_generation_projects_data(gen_scenario_id):
    """
    Fetch every generation plant belonging to one existing-and-planned
    scenario from the switch_wecc database.

    The plant table is joined with the existing-and-planned table on
    generation_plant_id and filtered to the requested scenario id. A short
    summary (project count, total capacity in GW, and the capacity-weighted
    average full load heat rate of plants whose heat rate is positive) is
    printed to the console.

    Returns the query result as delivered by connect_to_db_and_run_query.

    """

    print("Reading in existing and planned generation project data from database...")
    query_template = "SELECT * \
            FROM {PREFIX}generation_plant JOIN {PREFIX}generation_plant_existing_and_planned \
            USING (generation_plant_id) \
            WHERE generation_plant_existing_and_planned_scenario_id = {gen_scenario_id}"
    query = query_template.format(PREFIX=PREFIX, gen_scenario_id=gen_scenario_id)
    db_gens = connect_to_db_and_run_query(query=query, database='switch_wecc')

    separator = "======="
    print(separator)
    total_capacity_gw = db_gens['capacity'].sum()/1000.0
    print("Read in {} projects from the database for id {}, with {:.0f} GW of capacity".format(
        len(db_gens), gen_scenario_id, total_capacity_gw))

    # Only plants with a positive heat rate take part in the weighted average.
    has_heat_rate = db_gens['full_load_heat_rate'] > 0
    thermal_db_gens = db_gens[has_heat_rate]
    capacity_weighted_sum = thermal_db_gens['capacity'].dot(thermal_db_gens['full_load_heat_rate'])
    weighted_avg_heat_rate = capacity_weighted_sum/thermal_db_gens['capacity'].sum()
    print("Weighted average of heat rate: {:.3f} MMBTU/MWh".format(weighted_avg_heat_rate))
    print(separator)

    return db_gens

def _query_capacity_by_energy_source_and_tech(gen_scenario_id):
    """Return total capacity per (energy_source, gen_tech) for one existing-and-planned scenario id."""
    query = "SELECT SUM(capacity) as total_capacity_limit_mw, energy_source, gen_tech \
            FROM {PREFIX}generation_plant \
            JOIN {PREFIX}generation_plant_existing_and_planned \
            USING (generation_plant_id) \
            WHERE generation_plant_existing_and_planned_scenario_id = {gen_scenario_id} \
            GROUP BY energy_source, gen_tech \
            ORDER BY energy_source, gen_tech".format(PREFIX=PREFIX, gen_scenario_id=gen_scenario_id)
    return connect_to_db_and_run_query(query=query, database='switch_wecc')


def _write_scenario_comparison_tab(df, fpath):
    """Write a DataFrame to fpath as a tab-separated file with a header and no index."""
    with open(fpath, 'w') as outfile:
        df.to_csv(outfile, sep='\t', header=True, index=False)


def compare_generation_projects_scenario_data_by_energy_source(old_gen_scenario_id, new_gen_scenario_id):
    """
    Compares total capacity of a prior existing and planned scenario id with
    that of a newly added scenario, grouping by gen_tech and energy source.

    Use this function to compare generation_plant_existing_and_planned_scenario_id=2 (2015 EIA data)
    with new generation_plant_existing_and_planned_scenario_id from the 2018 EIA data update

    Three tab-separated files are written to the current working directory:
    one per scenario with the summed capacity, and one with both scenarios
    side by side plus their difference (new minus old) in 'scenario_diff_mw'.

    Arguments:
        old_gen_scenario_id: scenario id used as the baseline.
        new_gen_scenario_id: scenario id being compared against the baseline.

    Returns a tuple (old scenario capacities, new scenario capacities) as
    delivered by connect_to_db_and_run_query.

    """
    print("Query of existing and planned generation project capacity by energy source from database from generation_plant_existing_and_planned_scenario_id {old_gen_scenario_id}...".format(old_gen_scenario_id=old_gen_scenario_id))
    db_compare_gens_old_scenario = _query_capacity_by_energy_source_and_tech(old_gen_scenario_id)

    print("Output into CSV the query result of total nameplate capacity by state and energy source for generation_plant_existing_and_planned_scenario_id {old_gen_scenario_id}...".format(old_gen_scenario_id=old_gen_scenario_id))
    _write_scenario_comparison_tab(
        db_compare_gens_old_scenario,
        'Nameplate capacity by energy source for gen plant scenario {old_gen_scenario_id}.tab'.format(old_gen_scenario_id=old_gen_scenario_id))

    # This message previously reported the OLD scenario id while the query
    # that follows runs for the NEW one.
    print("Query of existing and planned generation project capacity by energy source from database from generation_plant_existing_and_planned_scenario_id {new_gen_scenario_id}...".format(new_gen_scenario_id=new_gen_scenario_id))
    db_compare_gens_new_scenario = _query_capacity_by_energy_source_and_tech(new_gen_scenario_id)

    print("Output into CSV the query result of total nameplate capacity by state and energy source for generation_plant_existing_and_planned_scenario_id {new_gen_scenario_id}...".format(new_gen_scenario_id=new_gen_scenario_id))
    _write_scenario_comparison_tab(
        db_compare_gens_new_scenario,
        'Nameplate capacity by energy source for gen plant scenario {new_gen_scenario_id}.tab'.format(new_gen_scenario_id=new_gen_scenario_id))

    # Left join from the new scenario: (energy_source, gen_tech) pairs that
    # only exist in the old scenario do not appear in the comparison, and
    # pairs that only exist in the new scenario get a NaN difference.
    compare_old_new_scenarios = pd.merge(
        db_compare_gens_new_scenario, db_compare_gens_old_scenario,
        how='left', on=['energy_source', 'gen_tech'], suffixes=('_new','_old'))
    compare_old_new_scenarios['scenario_diff_mw'] = (
        compare_old_new_scenarios['total_capacity_limit_mw_new']
        - compare_old_new_scenarios['total_capacity_limit_mw_old'])

    _write_scenario_comparison_tab(
        compare_old_new_scenarios,
        'Compare capacity by energy source for new and old gen plant scenarios.tab')

    return db_compare_gens_old_scenario, db_compare_gens_new_scenario

def filter_plants_by_region_id(region_id, year, host='switch-db2.erg.berkeley.edu', area=0.5):
    """
    Filters generation plant data by NERC Region, according to the provided id.
    Generation plants w/o Region get assigned to the NERC Region with which more
    than a certain percentage of its County area intersects (by default, 50%).
    A list is saved with Counties and States belonging to the specified Region.
    Both County and State are necessary to correctly assign plants (some County
    names exist in multiple States).

    Arguments:
        region_id: gid of the region in the ventyx_nerc_reg_region table.
        year: year of the processed_data/generation_projects_YEAR.tab file to read.
        host: database host passed to connect_to_db_and_run_query.
        area: minimum fraction of a county's area that must intersect the
            region for the county to be assigned to it.

    Side effects:
        Writes other_data/REGION_counties.tab if it does not exist yet, and
        reuses it on later calls. Cache files written before the state
        abbreviation fix below may contain county names that were wrongly
        rewritten to state abbreviations; delete them to regenerate.

    Returns a DataFrame with the filtered data, in which every coal energy
    source code listed in coal_codes has been replaced by 'COAL'.

    """

    state_dict = {
        'Alabama':'AL',
        'Alaska':'AK',
        'Arizona':'AZ',
        'Arkansas':'AR',
        'California':'CA',
        'Colorado':'CO',
        'Connecticut':'CT',
        'Delaware':'DE',
        'Florida':'FL',
        'Georgia':'GA',
        'Hawaii':'HI',
        'Idaho':'ID',
        'Illinois':'IL',
        'Indiana':'IN',
        'Iowa':'IA',
        'Kansas':'KS',
        'Kentucky':'KY',
        'Louisiana':'LA',
        'Maine':'ME',
        'Maryland':'MD',
        'Massachusetts':'MA',
        'Michigan':'MI',
        'Minnesota':'MN',
        'Mississippi':'MS',
        'Missouri':'MO',
        'Montana':'MT',
        'Nebraska':'NE',
        'Nevada':'NV',
        'New Hampshire':'NH',
        'New Jersey':'NJ',
        'New Mexico':'NM',
        'New York':'NY',
        'North Carolina':'NC',
        'North Dakota':'ND',
        'Ohio':'OH',
        'Oklahoma':'OK',
        'Oregon':'OR',
        'Pennsylvania':'PA',
        'Rhode Island':'RI',
        'South Carolina':'SC',
        'South Dakota':'SD',
        'Tennessee':'TN',
        'Texas':'TX',
        'Utah':'UT',
        'Vermont':'VT',
        'Virginia':'VA',
        'Washington':'WA',
        'West Virginia':'WV',
        'Wisconsin':'WI',
        'Wyoming':'WY'
    }

    #getting abbreviated name (regionabr) of NERC region from db (from switch_gis.public schema)
    print("Getting NERC region name from database...")
    query = "SELECT regionabr FROM ventyx_nerc_reg_region WHERE gid={}".format(
        region_id)
    region_name = connect_to_db_and_run_query(query=query,
        database='switch_gis', host=host)['regionabr'][0]

    #read in existing file with list of counties in each state in the region or, if the file
    # doesn't exist, assign county to state and region if input % of area falls into region
    counties_path = os.path.join('other_data', '{}_counties.tab'.format(region_name))
    if not os.path.exists(counties_path):
        # assign county if (area)% or more of its area falls in the region
        query = "SELECT name, state\
                 FROM ventyx_nerc_reg_region regions CROSS JOIN us_counties cts\
                 JOIN (SELECT DISTINCT state, state_fips FROM us_states) sts \
                 ON (sts.state_fips=cts.statefp) \
                 WHERE regions.gid={region_id} AND\
                 ST_Area(ST_Intersection(cts.the_geom, regions.the_geom))/\
                 ST_Area(cts.the_geom)>={area}".format(region_id=region_id, area=area)
        print("\nGetting counties and states for the region from database...")
        region_counties = pd.DataFrame(connect_to_db_and_run_query(query=query,
            database='switch_gis', host=host)).rename(columns={'name':'County','state':'State'})
        # Abbreviate state names in the State column ONLY. A frame-wide
        # replace would also rewrite counties that share their name with a
        # state (e.g. Washington, Nevada, Utah or Idaho county) to the state
        # abbreviation, and plants in those counties would then be lost in
        # the County/State join below.
        region_counties['State'] = region_counties['State'].replace(state_dict)
        region_counties.to_csv(counties_path, sep='\t', index=False)
    else:
        print("Reading counties from .tab file...")
        region_counties = pd.read_csv(counties_path, sep='\t', index_col=None)

    #reading in the processed generator project data from scrape.py from EIA 860 forms for each year
    generators = pd.read_csv(
        os.path.join('processed_data','generation_projects_{}.tab'.format(year)), sep='\t')
    # Title-case county names so that they compare equal to the county list.
    generators.loc[:,'County'] = generators['County'].map(lambda c: str(c).title())

    print("\nRead in data for {} generators, of which:".format(len(generators)))
    print("--{} are existing".format(len(generators[generators['Operational Status']=='Operable'])))
    print("--{} are proposed".format(len(generators[generators['Operational Status']=='Proposed'])))

    #generators that already carry this NERC region in the EIA data are kept as they are;
    #generators without any NERC region are assigned by an inner join on county and state
    generators_with_assigned_region = generators.loc[generators['Nerc Region'] == region_name]
    generators = generators[generators['Nerc Region'].isnull()]
    generators_without_assigned_region = pd.merge(generators, region_counties, how='inner', on=['County','State'])
    generators = pd.concat([
        generators_with_assigned_region,
        generators_without_assigned_region],
        axis=0)
    # Collapse all coal energy source codes into a single 'COAL' fuel.
    generators.replace(
            to_replace={'Energy Source':coal_codes, 'Energy Source 2':coal_codes,
            'Energy Source 3':coal_codes}, value='COAL', inplace=True)

    existing_gens = generators[generators['Operational Status']=='Operable']
    proposed_gens = generators[generators['Operational Status']=='Proposed']

    print("=======")
    print("Filtered to {} projects in the {} region, of which:".format(
        len(generators), region_name))
    print("--{} are existing with {:.0f} GW of capacity".format(
        len(existing_gens), existing_gens['Nameplate Capacity (MW)'].sum()/1000.0))
    print("--{} are proposed with {:.0f} GW of capacity".format(
        len(proposed_gens), proposed_gens['Nameplate Capacity (MW)'].sum()/1000.0))
    print("=======")

    return generators


def compare_eia_heat_rates_to_ampl_projs(year):
    """
    Compares calculated 'Best Heat Rates' for EIA plants with full load heat
    rates of previously stored Switch AMPL data (generation scenario id 1) in
    the database.

    Two tab files are written to processed_data:
        heat_rate_comparison.tab: database projects joined with the EIA
            generation projects of the given year (region 13).
        heat_rate_comparison_wide_test.tab: database projects joined with the
            rows of historic_heat_rates_WIDE.tab for the given year.
    Both joins are left joins on plant name and prime mover, and only rows
    with a positive full_load_heat_rate are kept.

    Arguments:
        year: year of the EIA data to compare against.

    Returns the first comparison DataFrame (the one written to
    heat_rate_comparison.tab).
    """

    # Database technology names mapped to EIA prime mover codes.
    gen_tech_to_prime_mover = {
        'Coal_Steam_Turbine':'ST',
        'Gas_Steam_Turbine':'ST',
        'Gas_Combustion_Turbine':'GT',
        'Gas_Combustion_Turbine_Cogen':'GT',
        'CCGT':'CC',
        'DistillateFuelOil_Combustion_Turbine':'GT',
        'DistillateFuelOil_Internal_Combustion_Engine':'IC',
        'Geothermal':'ST',
        'Gas_Internal_Combustion_Engine':'IC',
        'Bio_Gas_Internal_Combustion_Engine':'IC',
        'Bio_Gas_Steam_Turbine':'ST'
        }

    db_gen_projects = pull_generation_projects_data(gen_scenario_id=1).rename(
        columns={'name':'Plant Name'})
    # Derive 'Prime Mover' as a new column instead of renaming gen_tech, so
    # that gen_tech is still available for the output columns below. The
    # result is assigned back explicitly: an inplace replace on a .loc
    # selection is not guaranteed to modify the frame.
    db_gen_projects['Prime Mover'] = db_gen_projects['gen_tech'].replace(
        gen_tech_to_prime_mover)

    eia_gen_projects = filter_plants_by_region_id(13, year) #region 13 is WECC

    # reindex(columns=...) yields a NaN column for any label that is absent
    # from the merged frame, where .loc raises KeyError in recent pandas.
    df = pd.merge(db_gen_projects, eia_gen_projects,
        on=['Plant Name','Prime Mover'], how='left').reindex(columns=[
        'Plant Name','gen_tech','energy_source','full_load_heat_rate',
        'Best Heat Rate','Prime Mover','Energy Source','Energy Source 2','Operating Year'])
    df = df[df['full_load_heat_rate']>0]

    print("\nPrinting intersection of DB and EIA generation projects that have a specified heat rate to heat_rate_comparison.tab")

    fpath = os.path.join('processed_data','heat_rate_comparison.tab')
    with open(fpath, 'w') as outfile:
        df.to_csv(outfile, sep='\t', header=True, index=False)

    # Second comparison, against the historic 'Best Heat Rate' table.
    # sep belongs to read_csv; it used to be passed to os.path.join, which
    # raises a TypeError.
    eia_best_historic_heat_rate = pd.read_csv(
        os.path.join('processed_data','historic_heat_rates_WIDE.tab'), sep='\t')
    eia_best_historic_heat_rate = eia_best_historic_heat_rate[eia_best_historic_heat_rate['Year'] == year]

    # NOTE(review): this join needs a 'Plant Name' column in
    # historic_heat_rates_WIDE.tab -- confirm that the file provides it.
    df2 = pd.merge(db_gen_projects, eia_best_historic_heat_rate,
        on=['Plant Name','Prime Mover'], how='left').reindex(columns=[
        'Plant Name','gen_tech','energy_source','full_load_heat_rate',
        'Best Heat Rate','Prime Mover','Energy Source','Energy Source 2','Year'])
    df2 = df2[df2['full_load_heat_rate']>0]

    print("\nPrinting intersection of DB and EIA generation projects that have a specified heat rate to heat_rate_comparison_wide_test.tab")

    fpath = os.path.join('processed_data','heat_rate_comparison_wide_test.tab')
    with open(fpath, 'w') as outfile:
        df2.to_csv(outfile, sep='\t', header=True, index=False)

    return df


def assign_heat_rates_to_projects(generators, year):
    """
    Assigns a 'Best Heat Rate' to every Operable and Proposed generator.

    Creates uniform fuel list based on https://www.seia.org/sites/default/files/EIA-860.pdf

    Assigns calculated heat rates based on EIA923 data (read from
    processed_data/historic_heat_rates_WIDE.tab, rows of the given year) to
    plants parsed from EIA860 data. Receives a DataFrame with all generators
    and the year. Only prime movers CC, GT, IC and ST are treated as thermal.

    Coal plants with better heat rates than 8.607 MMBTU/MWh (still need to add
    the reference to this best historic heat rate of 2015) and other thermal
    plants with heat rate better (lower) than 6.711 MMBTU/MWh are ignored and get
    assigned an average heat rate, since we assume a report error has taken place.

    Heat rates that are too high (too bad to be realistic) are ignored as well:
    anything greater than 100 MMBTU/MWh is treated like a missing value.

    Average HR by energy source in recent years here: https://www.eia.gov/electricity/annual/html/epa_08_01.html

    Among the remaining (accepted) heat rates, the top and bottom .5% get
    replaced by the heat rate found at the top and bottom .5 percentile
    position, respectively.

    Heat rate averages used to replace unrealistic values and to be assigned to
    projects without heat rate are calculated as the average heat rate of plants
    with the same technology, energy source and vintage. A window of +/- 2 years
    (4 years wide) is used to identify plants with similar vintage. If fewer
    than 4 plants fall into this window, it is enlarged successively. If no
    other project with the same technology-energy source combination exists,
    then the technology's average heat rate is used. The last two assignments
    (per technology-energy source-window if other projects exist, and per
    technology if no other projects exist) are applied to both existing
    projects without heat rate data and to new projects. For proposed projects
    the given year is used as the vintage.

    Plotting of the heat rate distributions per technology and energy source
    is currently disabled (the ggplot code below is commented out).

    Returns a DataFrame holding the Operable and the Proposed rows of the
    input, with a Best Heat Rate column (NaN for non-thermal projects).

    """

    # Maps EIA energy source codes to the fuel names used in this module.
    # NOTE(review): 'JF' (jet fuel) is mapped to 'ResidualFuelOil' -- confirm
    # that this is intended.
    fuels = {
        'LFG':'Bio_Gas', #landfill gas
        'OBG':'Bio_Gas', #other biomass gas
        'AB':'Bio_Solid', #agricultural by-product
        'BLQ':'Bio_Liquid', #black liquor
        'NG':'Gas', #natural gas
        'OG':'Gas', #other gas
        'PG':'Gas', #propane
        'DFO':'DistillateFuelOil', #distillate fuel oil
        'JF':'ResidualFuelOil', #jet fuel
        'COAL':'Coal',
        'GEO':'Geothermal', #geothermal
        'NUC':'Uranium', #nuclear
        'PC':'Coal', #Petroleum Coke
        'SUN':'Solar', #solar
        'WDL':'Bio_Liquid', #wood waste liquids
        'WDS':'Bio_Solid', #wood waste solids
        'MSW':'Bio_Solid', #municipal solid waste
        'PUR':'Purchased_Steam', #purchased steam
        'WH':'Waste_Heat', #Waste heat not directly attributed to a fuel source
        'OTH':'Other', #other
        'WAT':'Water', #water (hydro)
        'MWH':'Electricity', #Electricity used for energy storage
        'WND':'Wind' #wind
    }
    # Only the primary 'Energy Source' column is translated.
    generators = generators.replace({'Energy Source':fuels})

    existing_gens = generators[generators['Operational Status']=='Operable']
    print "-------------------------------------"
    print "There are {} existing operable thermal projects that sum up to {:.1f} GW.".format(
        len(existing_gens[existing_gens['Prime Mover'].isin(['CC','GT','IC','ST'])]),
        existing_gens[existing_gens['Prime Mover'].isin(['CC','GT','IC','ST'])][
            'Nameplate Capacity (MW)'].sum()/1000)

    #reading in previously processed historic heat rate, restricted to the given year
    heat_rate_data = pd.read_csv(
        os.path.join('processed_data','historic_heat_rates_WIDE.tab'), sep='\t').rename(
        columns={'Plant Code':'EIA Plant Code'})
    heat_rate_data = heat_rate_data[heat_rate_data['Year']==year]
    heat_rate_data = heat_rate_data.replace({'Energy Source':fuels})
    # Left join: every existing generator is kept, and gets a NaN heat rate
    # when no row matches its plant code, prime mover and energy source.
    thermal_gens = pd.merge(
        existing_gens, heat_rate_data[['EIA Plant Code','Prime Mover','Energy Source','Best Heat Rate']],
        how='left', suffixes=('',''),
        on=['EIA Plant Code','Prime Mover','Energy Source']).drop_duplicates()
    thermal_gens = thermal_gens[thermal_gens['Prime Mover'].isin(['CC','GT','IC','ST'])]

    # Replace null and unrealistic heat rates by average values per technology,
    # fuel, and vintage. Also, set HR of top and bottom .5% to max and min
    null_heat_rates = thermal_gens['Best Heat Rate'].isnull()
    unrealistic_heat_rates = (((thermal_gens['Energy Source'] == 'Coal') &
            (thermal_gens['Best Heat Rate'] < 8.607)) |
        ((thermal_gens['Energy Source'] != 'Coal') &
            (thermal_gens['Best Heat Rate'] < 6.711)) |
            (thermal_gens['Best Heat Rate'] > 100)) # Additional criteria for upper outliers
    print "{} generators don't have heat rate data specified ({:.1f} GW of capacity)".format(
        len(thermal_gens[null_heat_rates]), thermal_gens[null_heat_rates]['Nameplate Capacity (MW)'].sum()/1000.0)
    # NOTE(review): the count below also includes heat rates above 100
    # MMBTU/MWh, although the message only mentions "better" heat rates.
    print "{} generators have better heat rate than the best historical records ({} GW of capacity)".format(
        len(thermal_gens[unrealistic_heat_rates]), thermal_gens[unrealistic_heat_rates]['Nameplate Capacity (MW)'].sum()/1000.0)
    # Split into generators with an accepted heat rate and those that need one assigned.
    thermal_gens_w_hr = thermal_gens[~null_heat_rates & ~unrealistic_heat_rates]
    thermal_gens_wo_hr = thermal_gens[null_heat_rates | unrealistic_heat_rates]

    # Print fuels and technologies with missing HR to console

    # for fuel in thermal_gens_wo_hr['Energy Source'].unique():
    #     print "{} of these use {} as their fuel".format(
    #         len(thermal_gens_wo_hr[thermal_gens_wo_hr['Energy Source']==fuel]),fuel)
    #     print "Technologies:"
    #     for prime_mover in thermal_gens_wo_hr[thermal_gens_wo_hr['Energy Source']==fuel]['Prime Mover'].unique():
    #         print "\t{} use {}".format(
    #             len(thermal_gens_wo_hr[(thermal_gens_wo_hr['Energy Source']==fuel) &
    #                 (thermal_gens_wo_hr['Prime Mover']==prime_mover)]),prime_mover)

    print "-------------------------------------"
    # NOTE(review): the message says "per technology and fuel", but the
    # clamping below is done once over all accepted thermal generators together.
    print "Assigning max/min heat rates per technology and fuel to top .5% / bottom .5%, respectively:"
    # Number of rows to clamp at EACH end of the sorted heat rates.
    n_outliers = int(len(thermal_gens_w_hr)*0.005)
    thermal_gens_w_hr = thermal_gens_w_hr.sort_values('Best Heat Rate')
    # After sorting, the first kept value from each end becomes the clamp value.
    min_hr = thermal_gens_w_hr.loc[thermal_gens_w_hr.index[n_outliers],'Best Heat Rate']
    max_hr = thermal_gens_w_hr.loc[thermal_gens_w_hr.index[-1-n_outliers],'Best Heat Rate']
    print "(Total capacity of these plants is {:.1f} GW)".format(
        thermal_gens_w_hr[thermal_gens_w_hr['Best Heat Rate'] < min_hr]['Nameplate Capacity (MW)'].sum()/1000.0 +
        thermal_gens_w_hr[thermal_gens_w_hr['Best Heat Rate'] > max_hr]['Nameplate Capacity (MW)'].sum()/1000.0)
    print "Minimum heat rate is {:.3f}".format(min_hr)
    print "Maximum heat rate is {:.3f}".format(max_hr)
    # Overwrite the n_outliers lowest and highest heat rates with the clamp values.
    for i in range(n_outliers):
        thermal_gens_w_hr.loc[thermal_gens_w_hr.index[i],'Best Heat Rate'] = min_hr
        thermal_gens_w_hr.loc[thermal_gens_w_hr.index[-1-i],'Best Heat Rate'] = max_hr

    #window = 2 means the average HR is assigned +/- 2 years, or a 4 year wide window
    def calculate_avg_heat_rate(thermal_gens_df, prime_mover, energy_source, vintage, window=2):
        # Mean 'Best Heat Rate' of the rows in thermal_gens_df that share the
        # prime mover and energy source and whose 'Operating Year' lies within
        # +/- window years of vintage.
        similar_generators = thermal_gens_df[
            (thermal_gens_df['Prime Mover']==prime_mover) &
            (thermal_gens_df['Energy Source']==energy_source) &
            (thermal_gens_df['Operating Year']>=vintage-window) &
            (thermal_gens_df['Operating Year']<=vintage+window)]
        while len(similar_generators) < 4: # If fewer than 4 plants fall into this window, it is enlarged successively.
            window += 2
            similar_generators = thermal_gens_df[
                (thermal_gens_df['Prime Mover']==prime_mover) &
                (thermal_gens_df['Energy Source']==energy_source) &
                (thermal_gens_df['Operating Year']>=vintage-window) &
                (thermal_gens_df['Operating Year']<=vintage+window)]
            # thermal generator operating years span from 1925 to 2018, so a window of 103 years is the maximum
            if window >= 103:
                break
        if len(similar_generators) > 0:
            return similar_generators['Best Heat Rate'].mean()
        else:
            # If no other similar projects exist, return average of technology
            return thermal_gens_df[thermal_gens_df['Prime Mover']==prime_mover]['Best Heat Rate'].mean()


    print "-------------------------------------"
    print "Assigning average heat rates per technology, fuel, and vintage to projects w/o heat rate..."
    # The averages are always taken from the generators with an accepted heat
    # rate (thermal_gens_w_hr), never from the ones being filled in.
    for idx in thermal_gens_wo_hr.index:
        pm = thermal_gens_wo_hr.loc[idx,'Prime Mover']
        es = thermal_gens_wo_hr.loc[idx,'Energy Source']
        v = thermal_gens_wo_hr.loc[idx,'Operating Year']
        #print "{}\t{}\t{}\t{}".format(pm,es,v,calculate_avg_heat_rate(thermal_gens_w_hr, pm, es, v))
        thermal_gens_wo_hr.loc[idx,'Best Heat Rate'] = calculate_avg_heat_rate(
            thermal_gens_w_hr, pm, es, v)

    thermal_gens = pd.concat([thermal_gens_w_hr, thermal_gens_wo_hr], axis=0)
    # Attach the heat rates to all existing generators by joining on every
    # column of existing_gens; with how='left', non-thermal generators are
    # kept and get a NaN heat rate.
    existing_gens = pd.merge(existing_gens, thermal_gens, on=list(existing_gens.columns), how='left')


    # Plot histograms for resulting heat rates per technology and fuel
    # ('Technology' is added after the merge above, so it only serves the plot.)
    thermal_gens["Technology"] = thermal_gens["Energy Source"].map(str) + ' ' + thermal_gens["Prime Mover"]
    # Commented out because of a pandas update that caused an error with ggplot2. The associated ggplot plotting code (for diagnostics) is also commented out in the script below
    #from ggplot import *
    #import rpy2
    #from pandas import Timestamp
    #p = ggplot(aes(x='Best Heat Rate',fill='Technology'), data=thermal_gens) + geom_histogram(binwidth=0.5) + facet_wrap("Technology")  + ylim(0,30)
    #p.save(os.path.join(outputs_directory,'heat_rate_distributions.pdf'))

    #assigning average heat rate of technology for proposed generation based on calculated average HR of available HR from EIA data (2004-2018)
    proposed_gens = generators[generators['Operational Status']=='Proposed']
    thermal_proposed_gens = proposed_gens[proposed_gens['Prime Mover'].isin(['CC','GT','IC','ST'])]
    other_proposed_gens = proposed_gens[~proposed_gens['Prime Mover'].isin(['CC','GT','IC','ST'])]
    print "There are {} proposed thermal projects that sum up to {:.2f} GW.".format(
        len(thermal_proposed_gens), thermal_proposed_gens['Nameplate Capacity (MW)'].sum()/1000)
    print "Assigning average heat rate of technology and fuel of most recent years from EIA (2004-2018)..."
    # Proposed projects use the data year as their vintage.
    for idx in thermal_proposed_gens.index:
        pm = thermal_proposed_gens.loc[idx,'Prime Mover']
        es = thermal_proposed_gens.loc[idx,'Energy Source']
        #print "{}\t{}\t{}\t{}".format(pm,es,v,calculate_avg_heat_rate(thermal_gens_w_hr, pm, es, v))
        thermal_proposed_gens.loc[idx,'Best Heat Rate'] = calculate_avg_heat_rate(
            thermal_gens_w_hr, pm, es, year)

    # Non-thermal proposed projects have no heat rate.
    other_proposed_gens['Best Heat Rate'] = float('nan')
    proposed_gens = pd.concat([thermal_proposed_gens,other_proposed_gens], axis=0)

    return pd.concat([existing_gens, proposed_gens], axis=0)


def finish_project_processing(year):
    """
    Receives a year, and processes the scraped EIA data for that year by using
    previously defined functions.

    The year that should be input is the most recent year of EIA data. At the time
    of updating this script (Aug 2020), 2018 was the most recent available vintage
    of "final" (not preliminary) EIA data.

    First, plants are obtained through filter_plants_by_region_id and filtered
    by region. For now, region 13 (WECC) is hardcoded.

    Second, plants are assigned heat rates through assign_heat_rates_to_projects.
    Plants with missing heat rates are assigned averages, and unrealistic heat
    rate values are replaced by reasonable parameters.

    Third, every 'Proposed' project is compared against the 'Operable' units
    that share its EIA Plant Code, Prime Mover and Energy Source:
        no matching operable unit       -> the project is a new project
        exactly one matching unit       -> the project is an uprate
        more than one matching unit     -> ambiguous; a message is printed and
                                           the project is written to neither file

    Prints out 3 tab files (tab-separated, in outputs_directory) with resulting data:
        existing_generation_projects_YEAR.tab
        new_generation_projects_YEAR.tab
        uprates_to_generation_projects_YEAR.tab

    These files are later post-processed and pushed into the Switch-WECC database
    of RAEL (UC Berkeley), though data is formatted in a general-purpose manner,
    so it could be used for any other purpose.

    Arguments:
        year: vintage of the EIA data to process; it is used to select the
            input data and to name the output files.

    Returns nothing; results are only written to disk.

    """
    def write_tab_file(df, fname):
        # All three outputs share the same format: tab-separated, utf-8, no index column
        with open(os.path.join(outputs_directory, fname),'w') as f:
            df.to_csv(f, sep='\t', encoding='utf-8', index=False)

    #assign generators to NERC regions and filter list just to WECC generators in given year
    generators = filter_plants_by_region_id(13, year)
    #assign average heat rates from similar vintage and technology to thermal
    # generators with missing or unrealistic heat rates
    generators = assign_heat_rates_to_projects(generators, year)
    existing_gens = generators[generators['Operational Status']=='Operable']
    proposed_gens = generators[generators['Operational Status']=='Proposed']

    #output to CSV the list of existing generation projects that have been processed for the given year
    write_tab_file(existing_gens, 'existing_generation_projects_{}.tab'.format(year))

    # These columns do not change inside the loop, so look them up only once
    existing_plant_codes = existing_gens['EIA Plant Code']
    existing_prime_movers = existing_gens['Prime Mover']
    existing_energy_sources = existing_gens['Energy Source']

    # One-row frames are accumulated in lists and concatenated a single time
    # after the loop. Concatenating inside the loop copies the whole
    # accumulated frame on every iteration (quadratic in the number of projects).
    new_gen_rows = []
    uprate_rows = []
    for idx in proposed_gens.index:
        pc = proposed_gens.loc[idx,'EIA Plant Code']
        pm = proposed_gens.loc[idx,'Prime Mover']
        es = proposed_gens.loc[idx,'Energy Source']
        # Number of operable units at the same plant with the same technology and fuel
        n_existing_units = int((
            (existing_plant_codes == pc) &
            (existing_prime_movers == pm) &
            (existing_energy_sources == es)).sum())
        if n_existing_units == 0:
            new_gen_rows.append(pd.DataFrame(proposed_gens.loc[idx,:]).T)
        elif n_existing_units == 1:
            uprate_rows.append(pd.DataFrame(proposed_gens.loc[idx,:]).T)
        else:
            # Ambiguous match: only reported, the project is not written anywhere.
            # Single-argument print(...) behaves the same as the print statement.
            print("There is more than one option for uprating plant id {}, prime mover {} and energy source {}".format(int(pc), pm, es))

    # Keep the original result for the no-rows case: an empty DataFrame
    new_gens = pd.concat(new_gen_rows, axis=0) if new_gen_rows else pd.DataFrame()
    uprates = pd.concat(uprate_rows, axis=0) if uprate_rows else pd.DataFrame()

    #output to CSV the list of proposed generation projects that have been processed for the given year
    write_tab_file(new_gens, 'new_generation_projects_{}.tab'.format(year))

    write_tab_file(uprates, 'uprates_to_generation_projects_{}.tab'.format(year))


def upload_generation_projects(year):
    """
    Reads existing and new project data previously processed from the EIA forms
    in order to upload it to the Switch-WECC database of RAEL, at UC Berkeley.

    First, generation project data is read in from the processed tab files.

    Projects using Purchased Steam as their energy source are
    dropped from the generator set.

    Projects using Electricity as their energy source were previously also
    dropped from the generator set. But given the growing share of batteries in
    the capacity mix (presumably to meet the CA storage mandate), Batteries are removed
    from the "ignored" list and included in the list of existing and
    proposed generation.

    The list of retired plants in WECC that are still in the generator list is read in
    from the processed tab files. This list is joined with the processed plant-level list above
    and the retired nameplate capacity is subtracted. If the remaining capacity is 0, the
    plant is dropped from the list before uploading to the database.

    Projects using Other as their energy source are assigned Gas as default.

    Capacity limits are set as total existing and projected capacity for each
    project (e.g. no additional capacity additions will be allowed for
    predetermined projects in Switch).

    Plant-level heat rates are calculated by doing a capacity-weighted average
    over the individual heat rates of each unit in the plant that have the same
    technology and use the same energy source. This allows obtaining a single
    heat rate for plants with units that have different vintages.

    Baseload flags are set for all plants that use Nuclear, Coal, or Geothermal
    as their energy source.

    Variable flags are set for all plants that use Hydro, Photovoltaic, or
    Wind Turbine technologies.

    Cogen flags are set for all plants that declared being Cogen.

    Columns are renamed to match the PSQL database column definitions.

    Resulting generation plant data is uploaded to the database with generation
    plant scenario id 19 for 2018 vintage EIA data (previously was scenario 2 for
    2015 vintage EIA data). A subsequent aggregated set per technology, energy source,
    and load zone is uploaded with id 20 for 2018 vintage EIA data (previously
    was scenario 3 for 2015 vintage EIA data).

    WARNING: The upload process will clean the database from all previous projects
    with the same scenario ids (previously 2 and 3, now 19 and 20). This includes:
        Hydro capacity factors
        Plant cost
        Plant build years
        Plant scenario members
        Plant level data
        But not variable capacity factor data (that was uploaded after finishing
            this part of the code, so its still in the todo list).

    After uploading generation plant data, the geom column is populated with
    the geometric object representing the location of the project, for those
    projects with latitude and longitude defined.

    Then, plants are assigned to load zones:
        Plants with geom data are assigned to the zones into which their location
        falls.
        Plants without lat and long data are assigned to the load zone in which
        their County's centroid falls in.
        Plants with coordinates out of the WECC region (only a few) are assigned
        to the closest WECC load zone if they are within a 100 mile radius from
        its boundary. Otherwise, they are dropped from the data set (for now,
        only a couple of cases in the East Coast, which must have a reporting
        mistake).

    Outage rates, and variable O&M costs are assigned as
    technology-default values. For battery_storage gen_tech these technology-default
    values are copied into the technology default table from that of proposed
    battery storage in another scenario

    For the generators that have planned retirements, the max age is
    set to the planned retirement year - operating year. For all other generators,
    a technology-default value is assigned for the max age.

    Uploaded plants are assigned to generation plant scenario id 19 (was 2).

    The uploaded generation plant ids are recovered, so that build year data
    can be uploaded for existing and new projects.

    Fixed and investment costs are assigned a default value of 0 to all plants.

    Hydro capacity factors are uploaded for each hydro plant, according to
    nameplate capacity. Minimum flows are set to a default of 0.5 times the
    average flow. The hydro scenario id is set to 19 (was 2).

    The plant dataset is then aggregated by technology, energy source, and load
    zone, considering heat rate windows of 1 MMBTU/MWh (so that plants with
    significantly different heat rates are not lumped in together). Heat rates
    are averaged by weighting the capacity of each plant. Other properties,
    such as capacity limit, are simply summed.

    In the 2020 update, the dataset is uploaded with id 20 (was 3 in 2017),
    and build years, hydro capacity factors, and all other data is processed
    in the same way as for id 19 (was 2 in 2017).

    The scenario "mapping" tables (generation_plant_scenario,
    hydro_simple_scenario, generation_plant_cost_scenario, generation_plant_existing_and_planned_scenario)
     are updated to include the new scenario ids and scenario description

    """
    try:
        user = os.environ['SWITCH_USERNAME']
        password = os.environ['SWITCH_PASSWORD']
    except KeyError:
        user = getpass.getpass('Enter username for the database:')
        password = getpass.getpass('Enter database password for user {}:'.format(user))
    def read_output_csv(fname):
        try:
            return pd.read_csv(os.path.join(outputs_directory,fname), sep='\t', index_col=None)
        except:
            print "Failed to read file {}. It will be considered to be empty.".format(fname)
            return None

    existing_gens = read_output_csv('existing_generation_projects_{}.tab'.format(year))
    new_gens = read_output_csv('new_generation_projects_{}.tab'.format(year))
    uprates = read_output_csv('uprates_to_generation_projects_{}.tab'.format(year))
    if uprates is not None:
        print "Read data for {} existing projects, {} new projects, and {} uprates".format(
            len(existing_gens), len(new_gens), len(uprates))
        print "Existing capacity: {:.2f} GW".format(existing_gens['Nameplate Capacity (MW)'].sum()/1000.0)
        print "Proposed capacity: {:.2f} GW".format(new_gens['Nameplate Capacity (MW)'].sum()/1000.0)
        print "Capacity uprates: {:.2f} GW".format(uprates['Nameplate Capacity (MW)'].sum()/1000.0)
    else:
        print "Read data for {} existing projects and {} new projects".format(
            len(existing_gens), len(new_gens))
        print "Existing capacity: {:.2f} GW".format(existing_gens['Nameplate Capacity (MW)'].sum()/1000.0)
        print "Proposed capacity: {:.2f} GW".format(new_gens['Nameplate Capacity (MW)'].sum()/1000.0)

    generators = pd.concat([existing_gens, new_gens], axis=0)

    # Batteries were previously included on the list of ignored energy sources. But there are existing
    # batteries on the system, and as of the 2018 vintage EIA data about 800MW of batteries that are proposed.
    # So I have removed batteries from the list of ignored projects because it is a significant capacity amount
    # (to meet CA storage mandate)

    ignore_energy_sources = ['Purchased_Steam']
    #ignore_energy_sources = ['Purchased_Steam','Electricity']

    print ("Dropping projects that use Purchased Steam, since these"
    " are not modeled in Switch, totalizing {:.2f} GW of capacity").format(
        generators[generators['Energy Source'].isin(
            ignore_energy_sources)]['Nameplate Capacity (MW)'].sum()/1000.0)
    print "Replacing 'Other' for 'Gas' as energy source for {:.2f} GW of capacity".format(
        generators[generators['Energy Source'] == 'Other'][
            'Nameplate Capacity (MW)'].sum()/1000.0)
    generators.drop(generators[generators['Energy Source'].isin(
            ignore_energy_sources)].index, inplace=True)
    generators.replace({'Energy Source':{'Other':'Gas'}}, inplace=True)

    # Reading in the previously processed list of generators in WECC states that are retired or have
    # planned retirements, but are still in the list of existing or planned generation projects in WECC states.
    # This list of retired generators has had its capacity aggregated to the plant level by energy source, prime mover, and
    # operating year.

    retired_gens = read_output_csv('retired_WECC_aggregated_generation_projects_{}.tab'.format(year))

    retired_gens = retired_gens.rename(columns = {'Nameplate Capacity (MW)':'retired_capacity_mw'})

    print "Joining the aggregated capacity by plant with retired capacity by plant..."

    #join the aggregated (by plant) retired generator projects with the aggregated existing generator projects (by plant)
    index_cols = ['EIA Plant Code','Prime Mover', 'State','County', 'Operating Year']

    generators_and_retired = pd.merge(generators, retired_gens, on=index_cols, how='left')

    #subtract out the retired nameplate capacity from the aggregated existing generator capacity
    generators_and_retired['net_operating_capacity_limit_mw'] = generators_and_retired['Nameplate Capacity (MW)']- generators_and_retired['retired_capacity_mw']

    #drop generators entirely if the remaining nameplate capacity = 0 after retirements are subtracted out
    generators_no_retired = generators_and_retired[generators_and_retired['net_operating_capacity_limit_mw'] != 0]

    #for several instances where only a portion of the nameplate capacity is retired, the Nameplate Capacity
    # column is replaced with this difference value of remaining capacity
    generators_no_retired['Nameplate Capacity (MW)'] = np.where(generators_no_retired['net_operating_capacity_limit_mw'] > 0, generators_no_retired['net_operating_capacity_limit_mw'], generators_no_retired['Nameplate Capacity (MW)'])

    print ("Dropping {} projects from generator list that have since been retired, totaling {:.2f} GW of capacity").format(
        len(generators_and_retired) - len(generators_no_retired),sum(generators_and_retired['retired_capacity_mw'])/1000.0)

    #calculating the "max_age" parameter for generators that are still operating but have a planned retirement date as
    #the Planned Retirement Year - Operating Year. If the retirement year is not >0, make max age = 0. This will be replaced by technology default values in the database

    #generators_no_retired = generators_no_retired.astype({'Planned Retirement Year': 'int64', 'Operating Year':'int64'})
    generators_no_retired['Planned Retirement Year'][generators_no_retired['Planned Retirement Year'] == ' '] = 0
    generators_no_retired = generators_no_retired.astype({'Planned Retirement Year': 'float'})

    generators_no_retired['max_age'] = np.where(generators_no_retired['Planned Retirement Year'] > 0, generators_no_retired['Planned Retirement Year'] - generators_no_retired['Operating Year'], 0)

    generators_no_retired = generators_no_retired.astype({'max_age': 'int64'})

    #output to CSV the list of generators without retirements
    fname = 'WECC_non_retired_generation_projects_{}.tab'.format(year)
    with open(os.path.join(outputs_directory, fname),'w') as f:
        generators_no_retired.to_csv(f, sep='\t', encoding='utf-8', index=False)
        print "Saved data to {} file.\n".format(fname)

    #output to CSV the list of generators with retirements still flagged
    fname= 'WECC_generators_and_retired_projects_{}.tab'.format(year)
    with open(os.path.join(outputs_directory, fname),'w') as f:
        generators_and_retired.to_csv(f, sep='\t', encoding='utf-8', index=False)
        print "Saved data to {} file.\n".format(fname)

    #Dropping the unnecessary columns and renaming the dataframe back to "generators" now that the capacity of retired generators has been removed
    generators_no_retired = generators_no_retired.rename(columns={'Plant Name_x':'Plant Name'})
    generators_no_retired = generators_no_retired.drop(['Plant Name_y','retired_capacity_mw','Regulatory Status','net_operating_capacity_limit_mw'], axis=1)

    generators = generators_no_retired

    def weighted_avg(group, avg_name, weight_name):
        """
        Plant-level heat rates are calculated by doing a capacity-weighted average
        over the individual heat rates of each unit in the plant that have the same
        technology and use the same energy source. This allows obtaining a single
        heat rate for plants with units that have different vintages.

        http://stackoverflow.com/questions/10951341/pandas-dataframe-aggregate-function-using-multiple-columns
        """
        d = group[avg_name]
        w = group[weight_name]
        try:
            return (d * w).sum() / w.sum()
        except ZeroDivisionError:
            return d.mean()


    index_cols = ['EIA Plant Code','Prime Mover','Energy Source']
    print "Calculating capacity-weighted average heat rates per plant, technology and energy source..."
    generators = pd.merge(generators,
        pd.DataFrame(generators.groupby(index_cols).apply(weighted_avg, 'Best Heat Rate',
        'Nameplate Capacity (MW)')).reset_index().replace(0, float('nan')),
        how='right',
        on=index_cols).drop('Best Heat Rate', axis=1)

    print "Calculating maximum capacity limits per plant, technology and energy source..."
    gb = generators.groupby(index_cols)
    agg_generators = gb.agg({col:sum if col == 'Nameplate Capacity (MW)' else 'max'
                                    for col in generators.columns}).rename(columns=
                                    {'Nameplate Capacity (MW)':'capacity_limit_mw'}).reset_index(drop=True)
    generators = pd.merge(generators, agg_generators[index_cols+['capacity_limit_mw']],
        on=index_cols, how='right').reset_index(drop=True)

    print "Assigning baseload, variable and cogen flags..."
    generators.loc[:,'is_baseload'] = np.where(generators['Energy Source'].isin(
        ['Nuclear','Coal','Geothermal']),True,False)
    generators.loc[:,'is_variable'] = np.where(generators['Prime Mover'].isin(
        ['HY','PV','WT']),True,False)
    if 'Cogen' not in generators.columns:
        generators.loc[:,'is_cogen'] = False
    else:
        generators.loc[:,'is_cogen'] = np.where(generators['Cogen'] == 'Y',True,False)

    database_column_renaming_dict = {
        'EIA Plant Code':'eia_plant_code',
        'Plant Name':'name',
        'Prime Mover':'gen_tech',
        'Energy Source':'energy_source',
        0:'full_load_heat_rate',
        'Operating Year':'build_year',
        'Nameplate Capacity (MW)':'capacity',
        'max_age':'max_age'
        }

    generators.rename(columns=database_column_renaming_dict, inplace=True)

    generators.replace(' ',float('nan'), inplace=True)

    #round full load heat rate column to 3 decimal places
    generators['full_load_heat_rate'] = generators['full_load_heat_rate'].round(decimals=3)

    #rename battery storage gen_tech to match database naming convention
    generators['gen_tech'] = np.where(generators['gen_tech'] == 'BA', 'Battery_Storage', generators['gen_tech'])

    carry_on = getpass.getpass('WARNING: In order to push projects into the DB,'
        'all projects currently in the generation_plant table that are'
        'not present in the generation_plant_scenario_member table will be'
        'removed. Continue? [y/n]')
    while carry_on not in ['y','n']:
        carry_on = getpass.getpass('WARNING: In order to push projects into the DB,'
        'all projects currently in the generation_plant table that are'
        'not present in the generation_plant_scenario_member table will be'
        'removed. Continue? [y/n]')
    if carry_on == 'n':
        sys.exit()

    print "\n-----------------------------"
    print "Pushing generation plants to the DB:\n"

    # Make sure the "switch" schema is on the search path

    # Drop NOT NULL constraint for load_zone_id
    query = 'ALTER TABLE "{PREFIX}generation_plant" ALTER "load_zone_id" DROP NOT NULL;'.format(PREFIX=PREFIX)
    connect_to_db_and_run_query(query,
        database='switch_wecc', user=user, password=password, quiet=True)
    query = 'ALTER TABLE "{PREFIX}generation_plant" ALTER "max_age" DROP NOT NULL;'.format(PREFIX=PREFIX)
    connect_to_db_and_run_query(query,
        database='switch_wecc', user=user, password=password, quiet=True)

    # First, define gen_scenario_id as new_disaggregated_gen_scenario_id and delete previously stored projects for the scenario id
    gen_scenario_id = new_disaggregated_gen_scenario_id
    # Also define hydro simple scenario and generation_plant_cost scenario and delete previously stored projects for these scenario ids
    hydro_scenario_id = new_disaggregated_hydro_simple_scenario_id
    generation_plant_cost_id = new_disggregated_generation_plant_cost_id

    query = 'DELETE FROM {PREFIX}hydro_historical_monthly_capacity_factors\
        WHERE hydro_simple_scenario_id = {hydro_scenario_id}'.format(PREFIX = PREFIX, hydro_scenario_id = hydro_scenario_id)
    connect_to_db_and_run_query(query,
            database='switch_wecc', user=user, password=password, quiet=True)

    query = 'DELETE FROM {PREFIX}generation_plant_scenario_member\
        WHERE generation_plant_scenario_id = {gen_scenario_id}'.format(PREFIX = PREFIX, gen_scenario_id = gen_scenario_id)
    connect_to_db_and_run_query(query,
            database='switch_wecc', user=user, password=password, quiet=True)

    query = 'DELETE FROM {PREFIX}generation_plant_cost\
        WHERE generation_plant_cost_scenario_id = {generation_plant_cost_id}'.format(PREFIX = PREFIX, generation_plant_cost_id = generation_plant_cost_id)
    connect_to_db_and_run_query(query,
            database='switch_wecc', user=user, password=password, quiet=True)

    query = 'DELETE FROM {PREFIX}generation_plant_existing_and_planned\
        WHERE generation_plant_existing_and_planned_scenario_id = {gen_scenario_id}'.format(PREFIX = PREFIX, gen_scenario_id = gen_scenario_id)
    connect_to_db_and_run_query(query,
            database='switch_wecc', user=user, password=password, quiet=True)

    # These queries are for the scenario mapping tables to add descriptions of new scenarios
    query = 'DELETE FROM {PREFIX}hydro_simple_scenario\
        WHERE hydro_simple_scenario_id = {hydro_scenario_id}'.format(PREFIX = PREFIX, hydro_scenario_id = hydro_scenario_id)
    connect_to_db_and_run_query(query,
            database='switch_wecc', user=user, password=password, quiet=True)

    query = 'DELETE FROM {PREFIX}generation_plant_cost_scenario\
        WHERE generation_plant_cost_scenario_id = {generation_plant_cost_id}'.format(PREFIX = PREFIX, generation_plant_cost_id = generation_plant_cost_id)
    connect_to_db_and_run_query(query,
            database='switch_wecc', user=user, password=password, quiet=True)

    query = 'DELETE FROM {PREFIX}generation_plant_existing_and_planned_scenario\
        WHERE generation_plant_existing_and_planned_scenario_id = {gen_scenario_id}'.format(PREFIX = PREFIX, gen_scenario_id = gen_scenario_id)
    connect_to_db_and_run_query(query,
            database='switch_wecc', user=user, password=password, quiet=True)

    query = 'DELETE FROM {PREFIX}generation_plant_scenario\
        WHERE generation_plant_scenario_id = {gen_scenario_id}'.format(PREFIX = PREFIX, gen_scenario_id = gen_scenario_id)
    connect_to_db_and_run_query(query,
            database='switch_wecc', user=user, password=password, quiet=True)

    # It is necessary to temporarily disable triggers when deleting from
    # generation_plant table, because of multiple fkey constraints
    query = 'SET session_replication_role = replica;\
            DELETE FROM {PREFIX}generation_plant\
            WHERE generation_plant_id NOT IN\
            (SELECT generation_plant_id FROM {PREFIX}generation_plant_scenario_member);\
            SET session_replication_role = DEFAULT;'.format(PREFIX = PREFIX)
    connect_to_db_and_run_query(query,
            database='switch_wecc', user=user, password=password, quiet=True)

    print "Deleted previously stored projects for the EIA dataset (id {}). Pushing data...".format(gen_scenario_id)

    query = 'SELECT last_value FROM generation_plant_id_seq'
    first_gen_id = connect_to_db_and_run_query(query,
        database='switch_wecc', user=user, password=password, quiet=True).iloc[0,0] + 1

    # Added max_age to the list of uploaded columns for generators with planned
    # retirements; if a generator has no planned retirement, default max_age will be assigned in a later step
    generators_to_db = generators[['name','gen_tech','capacity_limit_mw',
        'full_load_heat_rate','max_age','is_variable','is_baseload','is_cogen',
        'energy_source','eia_plant_code', 'Latitude','Longitude','County',
        'State']].drop_duplicates()

    connect_to_db_and_push_df(df=generators_to_db,
        col_formats=("(DEFAULT,%s,%s,NULL,NULL,%s,NULL,NULL,NULL,%s,NULL,%s,NULL,%s,%s,%s,%s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,%s,%s,%s,%s,%s,NULL,NULL,NULL)"),
        table='{PREFIX}generation_plant'.format(PREFIX = PREFIX),
        database='switch_wecc', user=user, password=password, quiet=True)
    print "Successfully pushed generation plants!"

    query = 'SELECT last_value FROM generation_plant_id_seq'
    last_gen_id = connect_to_db_and_run_query(query,
        database='switch_wecc', user=user, password=password, quiet=True).iloc[0,0]

    # Populate geometry column for GIS work, using coordinate reference system EPSG:4326 (WGS 84), a common default
    query = "UPDATE {PREFIX}generation_plant\
        SET geom = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)\
        WHERE longitude IS NOT NULL AND latitude IS NOT NULL AND\
        generation_plant_id BETWEEN {first_gen_id} AND {last_gen_id}".format(PREFIX = PREFIX, first_gen_id = first_gen_id, last_gen_id = last_gen_id)
    connect_to_db_and_run_query(query,
        database='switch_wecc', user=user, password=password, quiet=True)

    # assigning generators to load zones

    # if generator lat-lon is available assign if within load zone boundary
    print "\nAssigning load zones..."
    query = "UPDATE {PREFIX}generation_plant SET load_zone_id = z.load_zone_id\
        FROM {PREFIX}load_zone z\
        WHERE ST_contains(boundary, geom) AND\
        generation_plant_id BETWEEN {first_gen_id} AND {last_gen_id}".format(PREFIX = PREFIX, first_gen_id = first_gen_id, last_gen_id = last_gen_id)
    connect_to_db_and_run_query(query,
        database='switch_wecc', user=user, password=password, quiet=True)
    n_plants_assigned_by_lat_long = connect_to_db_and_run_query("SELECT count(*)\
        FROM {PREFIX}generation_plant WHERE load_zone_id IS NOT NULL AND\
        generation_plant_id BETWEEN {first_gen_id} AND {last_gen_id}".format(PREFIX = PREFIX, first_gen_id = first_gen_id, last_gen_id = last_gen_id),
        database='switch_wecc', user=user, password=password, quiet=True).iloc[0,0]
    print "--Assigned load zone according to lat & long to {} plants".format(
        n_plants_assigned_by_lat_long)

    #if generator lat-lon is not available, assign load zone based on state and county of generator
    query = "UPDATE {PREFIX}generation_plant g SET load_zone_id = z.load_zone_id\
        FROM {PREFIX}us_counties c\
        JOIN {PREFIX}load_zone z ON ST_contains(z.boundary, ST_centroid(c.the_geom))\
        WHERE g.load_zone_id IS NULL AND g.state = c.state_name AND g.county = c.name\
        AND generation_plant_id BETWEEN {first_gen_id} AND {last_gen_id}".format(PREFIX = PREFIX, first_gen_id = first_gen_id, last_gen_id = last_gen_id)
    connect_to_db_and_run_query(query,
        database='switch_wecc', user=user, password=password, quiet=True)
    n_plants_assigned_by_county_state = connect_to_db_and_run_query("SELECT count(*)\
        FROM {PREFIX}generation_plant WHERE load_zone_id IS NOT NULL AND\