diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..846faa2 Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index 231080c..9103d8d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ -*.csv -notebooks/.ipynb_checkpoints/* -notebooks/.ipynb_checkpoints/ \ No newline at end of file +input/ +input/*.csv +input/properties_2017.csv +datasets/ +.ipynb_checkpoints/ +notebooks/mini_lab_1_template_yao.ipynb +notebooks/mini_lab_1_template_yao.ipynb \ No newline at end of file diff --git a/Old/albert_old.ipynb b/Old/albert_old.ipynb deleted file mode 100644 index 479650e..0000000 --- a/Old/albert_old.ipynb +++ /dev/null @@ -1,1789 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "# Data Mining Project - zillow" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "Data Loading" - ] - }, - { - "cell_type": "code", -<<<<<<< Updated upstream - "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, -======= - "execution_count": 38, - "metadata": {}, ->>>>>>> Stashed changes - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (22,32,34,49,55) have mixed types. Specify dtype option on import or set low_memory=False.\n", - " interactivity=interactivity, compiler=compiler, result=result)\n" - ] - }, - { - "data": { - "text/html": [ - "
| \n", - " | parcelid | \n", - "airconditioningtypeid | \n", - "architecturalstyletypeid | \n", - "basementsqft | \n", - "bathroomcnt | \n", - "bedroomcnt | \n", - "buildingclasstypeid | \n", - "buildingqualitytypeid | \n", - "calculatedbathnbr | \n", - "decktypeid | \n", - "... | \n", - "numberofstories | \n", - "fireplaceflag | \n", - "structuretaxvaluedollarcnt | \n", - "taxvaluedollarcnt | \n", - "assessmentyear | \n", - "landtaxvaluedollarcnt | \n", - "taxamount | \n", - "taxdelinquencyflag | \n", - "taxdelinquencyyear | \n", - "censustractandblock | \n", - "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", - "10754147 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "0.0 | \n", - "0.0 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "... | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "9.0 | \n", - "2015.0 | \n", - "9.0 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "
| 1 | \n", - "10759547 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "0.0 | \n", - "0.0 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "... | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "27516.0 | \n", - "2015.0 | \n", - "27516.0 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "
| 2 | \n", - "10843547 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "0.0 | \n", - "0.0 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "... | \n", - "NaN | \n", - "NaN | \n", - "650756.0 | \n", - "1413387.0 | \n", - "2015.0 | \n", - "762631.0 | \n", - "20800.37 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "
| 3 | \n", - "10859147 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "0.0 | \n", - "0.0 | \n", - "3.0 | \n", - "7.0 | \n", - "NaN | \n", - "NaN | \n", - "... | \n", - "1.0 | \n", - "NaN | \n", - "571346.0 | \n", - "1156834.0 | \n", - "2015.0 | \n", - "585488.0 | \n", - "14557.57 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "
| 4 | \n", - "10879947 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "0.0 | \n", - "0.0 | \n", - "4.0 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "... | \n", - "NaN | \n", - "NaN | \n", - "193796.0 | \n", - "433491.0 | \n", - "2015.0 | \n", - "239695.0 | \n", - "5725.17 | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "
5 rows × 58 columns
\n", - "| Variable | Type | Scale | Description |
|---|---|---|---|
| airconditioningtypeid | nominal | [0.0, 1.0, 13.0, 5.0, 11.0, 9.0, 12.0, 3.0] | Type of cooling system present in the home (if any) |
| architecturalstyletypeid | nominal | [0.0, 7.0, 21.0, 8.0, 2.0, 3.0, 5.0, 10.0, 27.0] | Architectural style of the home (i.e. ranch, colonial, split-level, etc…) |
| basementsqft | ratio | (20, 8516) | Finished living area below or partially below ground level |
| bathroomcnt | ordinal | [0.0, 2.0, 4.0, 3.0, 1.0, ... (38 More)] | Number of bathrooms in home including fractional bathrooms |
| bedroomcnt | ordinal | [0.0, 4.0, 5.0, 2.0, 3.0, ... (22 More)] | Number of bedrooms in home |
| buildingqualitytypeid | ordinal | [0.0, 7.0, 4.0, 10.0, 1.0, ... (13 More)] | Overall assessment of condition of the building from best (lowest) to worst (highest) |
| buildingclasstypeid | nominal | [0.0, 3.0, 4.0, 5.0, 2.0, 1.0] | The building framing type (steel frame, wood frame, concrete/brick) |
| calculatedbathnbr | ordinal | [0.0, 2.0, 4.0, 3.0, 1.0, ... (35 More)] | Number of bathrooms in home including fractional bathroom |
| decktypeid | nominal | [0.0, 66.0] | Type of deck (if any) present on parcel |
| threequarterbathnbr | ordinal | [0.0, 1.0, 2.0, 4.0, 3.0, 6.0, 5.0, 7.0] | Number of 3/4 bathrooms in house (shower + sink + toilet) |
| finishedfloor1squarefeet | ratio | (3, 31303) | Size of the finished living area on the first (entry) floor of the home |
| calculatedfinishedsquarefeet | ratio | (1, 952576) | Calculated total finished living area of the home |
| finishedsquarefeet6 | ratio | (117, 952576) | Base unfinished and finished area |
| finishedsquarefeet12 | ratio | (1, 290345) | Finished living area |
| finishedsquarefeet13 | ratio | (120, 2688) | Perimeter living area |
| finishedsquarefeet15 | ratio | (112, 820242) | Total area |
| finishedsquarefeet50 | ratio | (3, 31303) | Size of the finished living area on the first (entry) floor of the home |
| \n", - " | Variable Name | \n", - "Number Missing Values | \n", - "Precent Missing | \n", - "
|---|---|---|---|
| 0 | \n", - "parcelid | \n", - "0 | \n", - "0.000000 | \n", - "
| 1 | \n", - "airconditioningtypeid | \n", - "2173698 | \n", - "72.815410 | \n", - "
| 2 | \n", - "architecturalstyletypeid | \n", - "2979156 | \n", - "99.796966 | \n", - "
| 3 | \n", - "basementsqft | \n", - "2983589 | \n", - "99.945465 | \n", - "
| 4 | \n", - "bathroomcnt | \n", - "11462 | \n", - "0.383959 | \n", - "
| 5 | \n", - "bedroomcnt | \n", - "11450 | \n", - "0.383557 | \n", - "
| 6 | \n", - "buildingclasstypeid | \n", - "2972588 | \n", - "99.576949 | \n", - "
| 7 | \n", - "buildingqualitytypeid | \n", - "1046729 | \n", - "35.063749 | \n", - "
| 8 | \n", - "calculatedbathnbr | \n", - "128912 | \n", - "4.318346 | \n", - "
| 9 | \n", - "decktypeid | \n", - "2968121 | \n", - "99.427311 | \n", - "
| 10 | \n", - "finishedfloor1squarefeet | \n", - "2782500 | \n", - "93.209304 | \n", - "
| 11 | \n", - "calculatedfinishedsquarefeet | \n", - "55565 | \n", - "1.861339 | \n", - "
| 12 | \n", - "finishedsquarefeet12 | \n", - "276033 | \n", - "9.246664 | \n", - "
| 13 | \n", - "finishedsquarefeet13 | \n", - "2977545 | \n", - "99.743000 | \n", - "
| 14 | \n", - "finishedsquarefeet15 | \n", - "2794419 | \n", - "93.608572 | \n", - "
| 15 | \n", - "finishedsquarefeet50 | \n", - "2782500 | \n", - "93.209304 | \n", - "
| 16 | \n", - "finishedsquarefeet6 | \n", - "2963216 | \n", - "99.263002 | \n", - "
| 17 | \n", - "fips | \n", - "11437 | \n", - "0.383121 | \n", - "
| 18 | \n", - "fireplacecnt | \n", - "2672580 | \n", - "89.527160 | \n", - "
| 19 | \n", - "fullbathcnt | \n", - "128912 | \n", - "4.318346 | \n", - "
| 20 | \n", - "garagecarcnt | \n", - "2101950 | \n", - "70.411967 | \n", - "
| 21 | \n", - "garagetotalsqft | \n", - "2101950 | \n", - "70.411967 | \n", - "
| 22 | \n", - "hashottuborspa | \n", - "2916203 | \n", - "97.688141 | \n", - "
| 23 | \n", - "heatingorsystemtypeid | \n", - "1178816 | \n", - "39.488453 | \n", - "
| 24 | \n", - "latitude | \n", - "11437 | \n", - "0.383121 | \n", - "
| 25 | \n", - "longitude | \n", - "11437 | \n", - "0.383121 | \n", - "
| 26 | \n", - "lotsizesquarefeet | \n", - "276099 | \n", - "9.248875 | \n", - "
| 27 | \n", - "poolcnt | \n", - "2467683 | \n", - "82.663438 | \n", - "
| 28 | \n", - "poolsizesum | \n", - "2957257 | \n", - "99.063385 | \n", - "
| 29 | \n", - "pooltypeid10 | \n", - "2948278 | \n", - "98.762603 | \n", - "
| 30 | \n", - "pooltypeid2 | \n", - "2953142 | \n", - "98.925539 | \n", - "
| 31 | \n", - "pooltypeid7 | \n", - "2499758 | \n", - "83.737899 | \n", - "
| 32 | \n", - "propertycountylandusecode | \n", - "12277 | \n", - "0.411260 | \n", - "
| 33 | \n", - "propertylandusetypeid | \n", - "11437 | \n", - "0.383121 | \n", - "
| 34 | \n", - "propertyzoningdesc | \n", - "1006588 | \n", - "33.719090 | \n", - "
| 35 | \n", - "rawcensustractandblock | \n", - "11437 | \n", - "0.383121 | \n", - "
| 36 | \n", - "regionidcity | \n", - "62845 | \n", - "2.105207 | \n", - "
| 37 | \n", - "regionidcounty | \n", - "11437 | \n", - "0.383121 | \n", - "
| 38 | \n", - "regionidneighborhood | \n", - "1828815 | \n", - "61.262381 | \n", - "
| 39 | \n", - "regionidzip | \n", - "13980 | \n", - "0.468308 | \n", - "
| 40 | \n", - "roomcnt | \n", - "11475 | \n", - "0.384394 | \n", - "
| 41 | \n", - "storytypeid | \n", - "2983593 | \n", - "99.945599 | \n", - "
| 42 | \n", - "threequarterbathnbr | \n", - "2673586 | \n", - "89.560859 | \n", - "
| 43 | \n", - "typeconstructiontypeid | \n", - "2978470 | \n", - "99.773986 | \n", - "
| 44 | \n", - "unitcnt | \n", - "1007727 | \n", - "33.757244 | \n", - "
| 45 | \n", - "yardbuildingsqft17 | \n", - "2904862 | \n", - "97.308236 | \n", - "
| 46 | \n", - "yardbuildingsqft26 | \n", - "2982570 | \n", - "99.911330 | \n", - "
| 47 | \n", - "yearbuilt | \n", - "59928 | \n", - "2.007492 | \n", - "
| 48 | \n", - "numberofstories | \n", - "2303148 | \n", - "77.151778 | \n", - "
| 49 | \n", - "fireplaceflag | \n", - "2980054 | \n", - "99.827048 | \n", - "
| 50 | \n", - "structuretaxvaluedollarcnt | \n", - "54982 | \n", - "1.841809 | \n", - "
| 51 | \n", - "taxvaluedollarcnt | \n", - "42550 | \n", - "1.425357 | \n", - "
| 52 | \n", - "assessmentyear | \n", - "11439 | \n", - "0.383188 | \n", - "
| 53 | \n", - "landtaxvaluedollarcnt | \n", - "67733 | \n", - "2.268947 | \n", - "
| 54 | \n", - "taxamount | \n", - "31250 | \n", - "1.046825 | \n", - "
| 55 | \n", - "taxdelinquencyflag | \n", - "2928755 | \n", - "98.108613 | \n", - "
| 56 | \n", - "taxdelinquencyyear | \n", - "2928753 | \n", - "98.108546 | \n", - "
| 57 | \n", - "censustractandblock | \n", - "75126 | \n", - "2.516601 | \n", - "
| \n", - " | parcelid | \n", - "airconditioningtypeid | \n", - "architecturalstyletypeid | \n", - "basementsqft | \n", - "bathroomcnt | \n", - "bedroomcnt | \n", - "buildingclasstypeid | \n", - "buildingqualitytypeid | \n", - "calculatedbathnbr | \n", - "decktypeid | \n", - "... | \n", - "yearbuilt | \n", - "numberofstories | \n", - "fireplaceflag | \n", - "structuretaxvaluedollarcnt | \n", - "taxvaluedollarcnt | \n", - "assessmentyear | \n", - "landtaxvaluedollarcnt | \n", - "taxamount | \n", - "taxdelinquencyyear | \n", - "censustractandblock | \n", - "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "1628.000000 | \n", - "2.973755e+06 | \n", - "2.973767e+06 | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "... | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "2.985217e+06 | \n", - "
| mean | \n", - "1.332586e+07 | \n", - "5.249796e-01 | \n", - "1.462373e-02 | \n", - "646.883292 | \n", - "2.209143e+00 | \n", - "3.088949e+00 | \n", - "1.576267e-02 | \n", - "3.756424e+00 | \n", - "2.148178e+00 | \n", - "3.779745e-01 | \n", - "... | \n", - "1.924829e+03 | \n", - "3.202096e-01 | \n", - "1.729523e-03 | \n", - "1.677362e+05 | \n", - "4.144857e+05 | \n", - "2.007278e+03 | \n", - "2.467494e+05 | \n", - "5.320825e+03 | \n", - "2.627685e-01 | \n", - "5.896216e+13 | \n", - "
| std | \n", - "7.909966e+06 | \n", - "1.852887e+00 | \n", - "3.422926e-01 | \n", - "538.793473 | \n", - "1.077754e+00 | \n", - "1.275859e+00 | \n", - "2.440230e-01 | \n", - "3.120234e+00 | \n", - "1.073639e+00 | \n", - "4.980307e+00 | \n", - "... | \n", - "2.764763e+02 | \n", - "6.423593e-01 | \n", - "4.155156e-02 | \n", - "3.990106e+05 | \n", - "7.228719e+05 | \n", - "1.244937e+02 | \n", - "4.415392e+05 | \n", - "9.151294e+03 | \n", - "1.925471e+00 | \n", - "9.479036e+12 | \n", - "
| min | \n", - "1.071172e+07 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "20.000000 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "... | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "-1.000000e+00 | \n", - "
| 25% | \n", - "1.164371e+07 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "272.000000 | \n", - "2.000000e+00 | \n", - "2.000000e+00 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "2.000000e+00 | \n", - "0.000000e+00 | \n", - "... | \n", - "1.950000e+03 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "7.216500e+04 | \n", - "1.738980e+05 | \n", - "2.015000e+03 | \n", - "6.821400e+04 | \n", - "2.409000e+03 | \n", - "0.000000e+00 | \n", - "6.037301e+13 | \n", - "
| 50% | \n", - "1.254509e+07 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "534.000000 | \n", - "2.000000e+00 | \n", - "3.000000e+00 | \n", - "0.000000e+00 | \n", - "4.000000e+00 | \n", - "2.000000e+00 | \n", - "0.000000e+00 | \n", - "... | \n", - "1.963000e+03 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "1.205040e+05 | \n", - "3.018630e+05 | \n", - "2.015000e+03 | \n", - "1.624370e+05 | \n", - "3.956000e+03 | \n", - "0.000000e+00 | \n", - "6.037555e+13 | \n", - "
| 75% | \n", - "1.409712e+07 | \n", - "1.000000e+00 | \n", - "0.000000e+00 | \n", - "847.250000 | \n", - "3.000000e+00 | \n", - "4.000000e+00 | \n", - "0.000000e+00 | \n", - "7.000000e+00 | \n", - "3.000000e+00 | \n", - "0.000000e+00 | \n", - "... | \n", - "1.981000e+03 | \n", - "0.000000e+00 | \n", - "0.000000e+00 | \n", - "1.947780e+05 | \n", - "4.840000e+05 | \n", - "2.015000e+03 | \n", - "3.022000e+05 | \n", - "6.166000e+03 | \n", - "0.000000e+00 | \n", - "6.059042e+13 | \n", - "
| max | \n", - "1.696019e+08 | \n", - "1.300000e+01 | \n", - "2.700000e+01 | \n", - "8516.000000 | \n", - "2.000000e+01 | \n", - "2.000000e+01 | \n", - "5.000000e+00 | \n", - "1.200000e+01 | \n", - "2.000000e+01 | \n", - "6.600000e+01 | \n", - "... | \n", - "2.015000e+03 | \n", - "4.100000e+01 | \n", - "1.000000e+00 | \n", - "2.514860e+08 | \n", - "2.827860e+08 | \n", - "2.016000e+03 | \n", - "9.024622e+07 | \n", - "3.458861e+06 | \n", - "9.900000e+01 | \n", - "4.830301e+14 | \n", - "
8 rows × 55 columns
\n", - "