From 5d299abb20e823dfe836388b9d03eded927e57df Mon Sep 17 00:00:00 2001
From: S NAJMABADI <shannon.najmabadi@gmail.com>
Date: Mon, 15 Aug 2016 08:56:48 -0400
Subject: [PATCH 1/2] adding donow10

---
 .../donow/najmabadi_shannon_10_donow.ipynb    | 405 ++++++++++++++++++
 1 file changed, 405 insertions(+)
 create mode 100644 class10/donow/najmabadi_shannon_10_donow.ipynb
diff --git a/class10/donow/najmabadi_shannon_10_donow.ipynb b/class10/donow/najmabadi_shannon_10_donow.ipynb
new file mode 100644
index 0000000..e154401
--- /dev/null
+++ b/class10/donow/najmabadi_shannon_10_donow.ipynb
@@ -0,0 +1,405 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create a classifier to predict the wine color from wine quality attributes using this dataset: http://archive.ics.uci.edu/ml/datasets/Wine+Quality"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## The data is in the database we've been using\n",
+    "+ host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com'\n",
+    "+ database='training'\n",
+    "+ port=5432\n",
+    "+ user='dot_student'\n",
+    "+ password='qgis'\n",
+    "+ table name = 'winequality'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pg8000\n",
+    "conn = pg8000.connect(user='dot_student', password='qgis', host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', port=5432, database='training')  # NOTE(review): credentials are hardcoded (the same ones listed in the markdown cell above); prefer environment variables if this is reused outside the class"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "\n",
+    "from sklearn import datasets\n",
+    "from sklearn import tree\n",
+    "from sklearn import metrics\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.cross_validation import cross_val_score"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Query for the data and create a numpy array"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'ph', 'sulphates', 'alcohol', 'color']\n"
+     ]
+    }
+   ],
+   "source": [
+    "cursor = conn.cursor()\n",
+    "# Ask only for column_name, in table order, instead of SELECT * plus a positional row[3] lookup\n",
+    "cursor.execute(\"SELECT column_name FROM information_schema.columns WHERE table_name = 'winequality' ORDER BY ordinal_position\")\n",
+    "# Each fetched row now holds a single value: the column name\n",
+    "column_names = [row[0] for row in cursor.fetchall()]\n",
+    "print(column_names)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Select the eleven measurement columns followed by the color label\n",
+    "statement = \"SELECT fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides, free_sulfur_dioxide, total_sulfur_dioxide, density, ph, sulphates, alcohol, color FROM winequality\"\n",
+    "cursor.execute(statement)\n",
+    "# Iterating the cursor yields the result rows one at a time; collect them all into a list\n",
+    "wine_quality = list(cursor)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>fixed_acidity</th>\n",
+       "      <th>volatile_acidity</th>\n",
+       "      <th>citric_acid</th>\n",
+       "      <th>residual_sugar</th>\n",
+       "      <th>chlorides</th>\n",
+       "      <th>free_sulfur_dioxide</th>\n",
+       "      <th>total_sulfur_dioxide</th>\n",
+       "      <th>density</th>\n",
+       "      <th>ph</th>\n",
+       "      <th>sulphates</th>\n",
+       "      <th>alcohol</th>\n",
+       "      <th>color</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>7</td>\n",
+       "      <td>0.27</td>\n",
+       "      <td>0.36</td>\n",
+       "      <td>20.7</td>\n",
+       "      <td>0.045</td>\n",
+       "      <td>45</td>\n",
+       "      <td>170</td>\n",
+       "      <td>1.001</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.45</td>\n",
+       "      <td>8.8</td>\n",
+       "      <td>W</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>6.3</td>\n",
+       "      <td>0.3</td>\n",
+       "      <td>0.34</td>\n",
+       "      <td>1.6</td>\n",
+       "      <td>0.049</td>\n",
+       "      <td>14</td>\n",
+       "      <td>132</td>\n",
+       "      <td>0.994</td>\n",
+       "      <td>3.3</td>\n",
+       "      <td>0.49</td>\n",
+       "      <td>9.5</td>\n",
+       "      <td>W</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>8.1</td>\n",
+       "      <td>0.28</td>\n",
+       "      <td>0.4</td>\n",
+       "      <td>6.9</td>\n",
+       "      <td>0.05</td>\n",
+       "      <td>30</td>\n",
+       "      <td>97</td>\n",
+       "      <td>0.9951</td>\n",
+       "      <td>3.26</td>\n",
+       "      <td>0.44</td>\n",
+       "      <td>10.1</td>\n",
+       "      <td>W</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  fixed_acidity volatile_acidity citric_acid residual_sugar chlorides  \\\n",
+       "0             7             0.27        0.36           20.7     0.045   \n",
+       "1           6.3              0.3        0.34            1.6     0.049   \n",
+       "2           8.1             0.28         0.4            6.9      0.05   \n",
+       "\n",
+       "  free_sulfur_dioxide total_sulfur_dioxide density    ph sulphates alcohol  \\\n",
+       "0                  45                  170   1.001     3      0.45     8.8   \n",
+       "1                  14                  132   0.994   3.3      0.49     9.5   \n",
+       "2                  30                   97  0.9951  3.26      0.44    10.1   \n",
+       "\n",
+       "  color  \n",
+       "0     W  \n",
+       "1     W  \n",
+       "2     W  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Label the columns in the same order they were selected in the query\n",
+    "df = pd.DataFrame(wine_quality, columns=['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'ph', 'sulphates', 'alcohol', 'color'])\n",
+    "df.head(3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Split the data into features (x) and target (y, the last column in the table)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Columns 0-10 are the measurements (features); column 11 is the color label (target)\n",
+    "array = np.array(df)\n",
+    "x, y = array[:, :11], array[:, 11]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Remember you can cast the results into a numpy array and then slice out what you want"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create a decision tree with the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# fit() hands back the fitted estimator, so construction and training can be chained\n",
+    "dt = DecisionTreeClassifier().fit(x, y)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Run 10-fold cross validation on the model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 0.97538462,  0.98615385,  0.97692308,  0.98153846,  0.98153846,\n",
+       "        0.98307692,  0.97538462,  0.97230769,  0.98459168,  0.97685185])"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "scores = cross_val_score(estimator=dt, X=x, y=y, cv=10)  # one score per fold, 10 folds\n",
+    "scores"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## If you have time, calculate the feature importance and graph based on the code in the [slides from last class](http://ledeprogram.github.io/algorithms/class9/#21)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['fixed_acidity',\n",
+       " 'volatile_acidity',\n",
+       " 'citric_acid',\n",
+       " 'residual_sugar',\n",
+       " 'chlorides',\n",
+       " 'free_sulfur_dioxide',\n",
+       " 'total_sulfur_dioxide',\n",
+       " 'density',\n",
+       " 'ph',\n",
+       " 'sulphates',\n",
+       " 'alcohol',\n",
+       " 'color']"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "columns = df.columns.tolist()  # plain Python list of the column labels\n",
+    "columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(0, 1)"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEACAYAAABI5zaHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEAVJREFUeJzt3X2MXNdZx/HvszWl4JaINmrQbohxty1BkZIQlRAoLxOC\niVMkXPEHiZO0dRDIQk1aQQtOKi271oJoJRBtUyByMXGLXFI1rUSQEhr3ZYRa6iSFvEHsxFlvN/Zu\nmqq0pcSowrUf/pixdzLxemY247njM9+PNNLcO2fufXx2/ds755y7G5mJJKksY1UXIEnqP8Ndkgpk\nuEtSgQx3SSqQ4S5JBTLcJalAHcM9InZGxHMR8dhp2nw4Ig5ExCMRcWl/S5Qk9aqbK/c7gatXejEi\nrgEmM/MNwFbgjj7VJklapY7hnplfAr59miabgI832z4AnBMR5/WnPEnSavRjzH0CONSyvdjcJ0mq\niBOqklSgNX04xiLw4y3b5zf3vUhE+ItsJGkVMjN6ad/tlXs0H6dyD/B2gIi4AvhOZj53mgJ9ZDI9\nPV15DcPysC/sC/vi9I/V6HjlHhGfAGrAayLiGWAaeHkjp3NHZt4bEW+JiKeBI8BNq6pEktQ3HcM9\nM6/vos3N/SlHktQPTqhWpFarVV3C0LAvltkXy+yLlyZWO56zqpNF5CDPJ0kliAjyDE2oSpLOIoa7\nJBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtS\ngQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXI\ncJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVqKtwj4iNEbE/Ip6KiG2neP01EXFf\nRDwSEY9HxJa+VypJ6lpk5ukbRIwBTwFXAUvAQ8B1mbm/pc008IrMvC0izgWeBM7LzO+3HSs7nU+S\n9EIRQWZGL+/p5sr9cuBAZi5k5lHgLmBTW5uvA69qPn8V8F/twS5JGpw1XbSZAA61bB+mEfitPgp8\nPiKWgFcC1/anPEnSanQT7t24DXg0M6+MiElgT0RcnJnPtzecmZk5+bxWq1Gr1fpUgiSVoV6vU6/X\nX9IxuhlzvwKYycyNze1bgczMD7S0uRf408z8cnP788C2zPxq27Ecc5ekHp2pMfeHgNdHxLqIeDlw\nHXBPW5t9wK82izgPeCNwsJdCJEn903FYJjOPRcTNwP00fhjszMx9EbG18XLuAP4MuDMiHgUC+KPM\n/NaZLFyStLKOwzJ9PZnDMpLUszM1LCNJOssY7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrsk\nFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCtTxD2RL\nOvPm5xeYmtrF4uJxJibGmJ3dwvr166ouS2cx/0C2VLH5+QU2bLidubntwFrgCJOT0+zZc4sBL8A/\nkC2dlaamdrUEO8Ba5ua2MzW1q8KqdLYz3KWKLS4eZznYT1jL0tLxKspRIQx3qWITE2PAkba9Rxgf\n97+nVs/vHqlis7NbmJycZjngG2Pus7NbKqtJZz8nVKUhcGK1zNLSccbHXS2jF1rNhKrhLklDztUy\nkiTAcJekIhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUBdhXtEbIyI/RHxVERsW6FNLSIejoj/\niIgv9rdMSVIvOt6hGhFjwFPAVcAS8BBwXWbub2lzDvCvwK9l5mJEnJuZ3zzFsbxDVZJ6dKbuUL0c\nOJCZC5l5FLgL2NTW5nrg05m5CHCqYJckDU434T4BHGrZPtzc1+qNwKsj4osR8VBEvK1fBUqSetev\nv6G6BrgM+BUaf3XgKxHxlcx8uk/Hl
yT1oJtwXwQuaNk+v7mv1WHgm5n5PeB7EfEvwCXAi8J9Zmbm\n5PNarUatVuutYkkqXL1ep16vv6RjdDOh+jLgSRoTqs8CDwKbM3NfS5sLgduBjcAPAg8A12bmE23H\nckJVknq0mgnVjlfumXksIm4G7qcxRr8zM/dFxNbGy7kjM/dHxGeBx4BjwI72YJckDY5/rEOShpx/\nrEOSBBjuklQkw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3\nSSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJek\nAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgboK94jY\nGBH7I+KpiNh2mnY/ExFHI+I3+1eiJKlXHcM9IsaAjwBXAxcBmyPiwhXavR/4bL+LlCT1ppsr98uB\nA5m5kJlHgbuATadodwtwN/CNPtYnSVqFbsJ9AjjUsn24ue+kiBgH3pqZfwNE/8qTJK1GvyZUPwi0\njsUb8JJUoTVdtFkELmjZPr+5r9WbgLsiIoBzgWsi4mhm3tN+sJmZmZPPa7UatVqtx5IlqWz1ep16\nvf6SjhGZefoGES8DngSuAp4FHgQ2Z+a+FdrfCfxTZn7mFK9lp/NJkl4oIsjMnkZEOl65Z+axiLgZ\nuJ/GMM7OzNwXEVsbL+eO9rf0UoAkqf86Xrn39WReuUtSz1Zz5e4dqpJUIMNdkgpkuEtSgQx3SSqQ\n4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnu\nklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5J\nBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoHWVF2ARtf8/AJTU7tYXDzOxMQYs7NbWL9+XdVl\nSUWIzOzcKGIj8EEaV/o7M/MDba9fD2xrbv4P8HuZ+fgpjpPdnE/lm59fYMOG25mb2w6sBY4wOTnN\nnj23GPBSm4ggM6OX93QclomIMeAjwNXARcDmiLiwrdlB4Jcy8xLgT4CP9lKERs/U1K6WYAdYy9zc\ndqamdlVYlVSObsbcLwcOZOZCZh4F7gI2tTbIzL2Z+d/Nzb3ARH/LVGkWF4+zHOwnrGVp6XgV5UjF\n6SbcJ4BDLduHOX14/w5w30spSuWbmBgDjrTtPcL4uHP8Uj/0dUI1Iq4EbgJ+YaU2MzMzJ5/XajVq\ntVo/S9BZYnZ2C3v3Tr9ozH129paKK5OqV6/XqdfrL+kYHSdUI+IKYCYzNza3bwXyFJOqFwOfBjZm\n5twKx3JCVSedWC2ztHSc8XFXy0grWc2Eajfh/jLgSeAq4FngQWBzZu5raXMB8HngbZm59zTHMtwl\nqUerCfeOwzKZeSwibgbuZ3kp5L6I2Np4OXcAU8Crgb+OiACOZublvf8TJEn90NU6976dzCt3SerZ\nGVnnLkk6+xjuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtS\ngQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXI\ncJekAhnuklQgw12SCrSm6gIGaX5+gampXSwuHmdiYozZ2S2sX7+u6rIkqe8iMwd3sogc5Plazc8v\nsGHD7czNbQfWAkeYnJxmz55bDHhJQy0iyMzo5T0jMywzNbWrJdgB1jI3t52pqV0VViVJZ8bIhPvi\n4nGWg/2EtSwtHa+iHEk6o0Ym3CcmxoAjbXuPMD4+Ml0gaYSMTLLNzm5hcnKa5YBvjLnPzm6prCZJ\nO
lNGZkIVllfLLC0dZ3zc1TJSO1eULRumvljNhOpIhbuklbmibNmw9MWJHzC7d8+cmdUyEbExIvZH\nxFMRsW2FNh+OiAMR8UhEXNpLEZKqNywryubnF7jxxu1ceeU0N964nfn5hYGeH4ajL078gNm9+72r\nen/HcI+IMeAjwNXARcDmiLiwrc01wGRmvgHYCtyx0vGq+mINixPfuJde+o5K+2IY/gOdUK/XKzs3\n2BcnDMOKstZAq9evZPfu97Jhw+0D/5oMQ1+8+AdMjzLztA/gCuC+lu1bgW1tbe4Arm3Z3gecd4pj\nJTyfk5PvyYMHv5aj5uDBr+Xk5HsSnk+YrqwvXlhHVv41mZ6eruS8mfZFqxtumGnphzzZHzfcMFNR\nDdOV1PDiOqrpi1rtj1vOTWaHrG5/dDMsMwEcatk+3Nx3ujaLp2jTNLo3Dw3DR71hqmMY2BfLhmFF\n2TBcMcNw9MWpl293r6LfLTOaNw8NyzfusNQxDOyLZevXr2PPnluYmvrzlhVlg51AXA601q/J4O9H\nGYa+mJ3dwt69082Lj951XC0TEVcAM5m5sbl9K42PCB9oaXMH8MXM/GRzez/wy5n5XNuxXCojSauQ\nPa6W6ebK/SHg9RGxDngWuA7Y3NbmHuCdwCebPwy+0x7sqylOkrQ6HcM9M49FxM3A/TRW1+zMzH0R\nsbXxcu7IzHsj4i0R8TSNz1Q3ndmyJUmnM9CbmCRJgzGwWYpuboQaBRFxfkR8ISL+MyIej4h3VV1T\nlSJiLCL+PSLuqbqWqkXEORHxqYjY1/z++Nmqa6pCRNzW/Pc/FhG7I+LlVdc0SBGxMyKei4jHWvb9\naETcHxFPRsRnI+KcTscZSLh3cyPUCPk+8AeZeRHwc8A7R7gvAN4NPFF1EUPiQ8C9mflTwCU07hcZ\nKc25vd8FfjozL6YxdHxdtVUN3J00srLVrcDnMvMngS8At3U6yKCu3C8HDmTmQmYeBe4CNg3o3EMl\nM7+emY80nz9P4z/wCvcElC0izgfeAvxt1bVULSJ+BPjFzLwTIDO/n5nfrbisKnwX+D9gbUSsAX4Y\nWKq2pMHKzC8B327bvQn4WPP5x4C3djrOoMK9mxuhRk5E/ARwKfBAtZVU5i+BPwSc+IH1wDcj4s7m\nMNWOiPihqosatMz8NvAXwDM0bob8TmZ+rtqqhsJrT6xAzMyvA6/t9IaR+X3uwyYiXgncDby7eQU/\nUiLi14Hnmp9iovkYZWuAy4C/yszLgP+l8VF8pETE64DfB9YB48ArI+L6aqsaSh0viAYV7ovABS3b\n5zf3jaTmx827gb/PzH+sup6KvBn4jYg4CPwDcGVEfLzimqp0GDiUmV9tbt9NI+xHzZuAL2fmtzLz\nGPAZ4OcrrmkYPBcR5wFExI8B3+j0hkGF+8kboZoz39fRuPFpVP0d8ERmfqjqQqqSme/LzAsy83U0\nvh++kJlvr7quqjQ/ch+KiDc2d13FaE40PwlcERGviIig0Q8jN7HMiz/N3gNsaT5/B9DxonAgv1tm\npRuhBnHuYRMRbwZuAB6PiIdpfLx6X2b+c7WVaQi8C9gdET8AHGQEbwbMzEebn+D+DTgGPAzsqLaq\nwYqITwA14DUR8QwwDbwf+FRE/DawAPxWx+N4E5MklccJVUkqkOEuSQUy3CWpQIa7JBXIcJekAhnu\nklQgw12SCmS4S1KB/h+FQThUEJWxmgAAAABJRU5ErkJggg==\n",
+      "text/plain": [
+       "<matplotlib.figure.Figure at 0x1140004e0>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "plt.gca().plot(dt.feature_importances_, 'o')  # markers only, one per feature\n",
+    "plt.gca().set_ylim(0, 1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Use [this tip for getting the column names from your cursor object](http://stackoverflow.com/questions/10252247/how-do-i-get-a-list-of-column-names-from-a-psycopg2-cursor)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "#alternative: let pandas run the query and build the DataFrame (column names included) in one step\n",
+    "\n",
+    "df = pd.read_sql('SELECT * FROM winequality', conn)\n",
+    "wine = df.values #same array as_matrix() returned; as_matrix() is deprecated and gone from pandas 1.0 onward\n",
+    "\n",
+    "x = wine[:,:-1] #every column except the last -- assumes color is the final column of the table (TODO confirm)\n",
+    "y = wine[:,-1] #the last column is the target"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}

From 773757f2db0e5ee4e6bc78595b3e963aac9f712a Mon Sep 17 00:00:00 2001
From: S NAJMABADI <shannon.najmabadi@gmail.com>
Date: Mon, 15 Aug 2016 09:01:38 -0400
Subject: [PATCH 2/2] adding hw9

---
 class9/homework/najmabadi_shannon_9_1.ipynb | 153 ++++++++++++++
 class9/homework/najmabadi_shannon_9_2.ipynb | 211 ++++++++++++++++++++
 2 files changed, 364 insertions(+)
 create mode 100644 class9/homework/najmabadi_shannon_9_1.ipynb
 create mode 100644 class9/homework/najmabadi_shannon_9_2.ipynb

diff --git a/class9/homework/najmabadi_shannon_9_1.ipynb b/class9/homework/najmabadi_shannon_9_1.ipynb
new file mode 100644
index 0000000..c7e9f66
--- /dev/null
+++ b/class9/homework/najmabadi_shannon_9_1.ipynb
@@ -0,0 +1,153 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Assignment 1\n",
+    "\n",
+    "Use the pseudocode you came up with in class to write your own 5-fold cross-validation function that splits the data set into 5 equal-sized sets\n",
+    "Don't forget to shuffle the input before assigning to sets\n",
+    "You can use the fit(), predict(), and score() functions of your model in your functions\n",
+    "Test the results with the sklearn cross_val_score\n",
+    "In your PR, discuss what challenges you had creating this function and if it helped you better understand cross validation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "import numpy as np\n",
+    "from sklearn import tree\n",
+    "from sklearn import metrics\n",
+    "from sklearn import datasets\n",
+    "from sklearn.cross_validation import cross_val_score\n",
+    "\n",
+    "iris = datasets.load_iris()\n",
+    "# Features are the data columns from index 2 onward; the target is the class label array\n",
+    "x, y = iris.data[:, 2:], iris.target"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "a = list(zip(x,y)) #Pair each feature row with its label so they stay aligned while shuffling\n",
+    "random.Random(42).shuffle(a) #Shuffle with a fixed seed so the folds (and the scores) are reproducible on re-run\n",
+    "x,y = zip(*a)  #Unzip back into parallel tuples of features and labels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Array 1: score of 0.933333333333\n",
+      "Array 2: score of 0.966666666667\n",
+      "Array 3: score of 0.9\n",
+      "Array 4: score of 0.966666666667\n",
+      "Array 5: score of 0.933333333333\n"
+     ]
+    }
+   ],
+   "source": [
+    "number_of_splices = 5 #Number of folds the shuffled data is divided into\n",
+    "list_length = len(a) #Total number of (features, label) pairs\n",
+    "splice_size = list_length // number_of_splices #Samples per fold (integer division)\n",
+    "score_list = [] #Per-fold accuracies, collected so the mean can be taken in the next cell\n",
+    "for i in range(1, number_of_splices + 1): #Folds are numbered from 1 to match the printed labels\n",
+    "    start, stop = splice_size * (i - 1), splice_size * i #The test fold is the half-open index range [start, stop)\n",
+    "    x_test, y_test = x[start:stop], y[start:stop] #Hold this fold out for testing\n",
+    "    x_train = x[:start] + x[stop:] #Train on every other sample; slicing to the end keeps the final sample, which the old 'list_length - 1' bound left out\n",
+    "    y_train = y[:start] + y[stop:] #x and y are tuples after the shuffle cell, so + concatenates\n",
+    "\n",
+    "    dt = tree.DecisionTreeClassifier().fit(x_train, y_train) #Fit a fresh tree on the training folds only\n",
+    "    y_pred = dt.predict(x_test) \n",
+    "    score = metrics.accuracy_score(y_test, y_pred)\n",
+    "    score_list.append(score)\n",
+    "    print(\"Array \" + str(i) + \": score of \" + str(score))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 164,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.9447619047619048"
+      ]
+     },
+     "execution_count": 164,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.mean(score_list) #Average the per-fold scores; score_list must contain one score per fold before this cell runs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 181,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.95333333333333348"
+      ]
+     },
+     "execution_count": 181,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "scores = cross_val_score(estimator=dt, X=x, y=y, cv=5) #Reference result from sklearn's own 5-fold cross-validation\n",
+    "scores.mean() #Mean of the fold scores, for comparison with the hand-rolled average"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/class9/homework/najmabadi_shannon_9_2.ipynb b/class9/homework/najmabadi_shannon_9_2.ipynb
new file mode 100644
index 0000000..d37d9e8
--- /dev/null
+++ b/class9/homework/najmabadi_shannon_9_2.ipynb
@@ -0,0 +1,211 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Assignment 2\n",
+    "\n",
+    "Using the readings, try and create a RandomForestClassifier for the iris dataset\n",
+    "Using a 25/75 training/test split, compare the results with the original decision tree model and describe the result to the best of your ability in your PR"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from sklearn import tree\n",
+    "from sklearn import metrics\n",
+    "from sklearn import datasets\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.cross_validation import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "iris = datasets.load_iris()\n",
+    "# Features are the data columns from index 2 onward; the target is the class label array\n",
+    "x, y = iris.data[:, 2:], iris.target"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
+       "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
+       "            min_samples_leaf=1, min_samples_split=2,\n",
+       "            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,\n",
+       "            oob_score=False, random_state=42, verbose=0, warm_start=False)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, train_size=0.75, random_state=42)  # seed the split as well, so the reported accuracies are reproducible\n",
+    "forest = RandomForestClassifier(n_estimators=5, random_state=42)  # forest of 5 trees with a fixed seed\n",
+    "forest.fit(x_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training set accuracy: 1.000000\n",
+      "Testing set accuracy: 0.947368\n"
+     ]
+    }
+   ],
+   "source": [
+    "train_accuracy, test_accuracy = forest.score(x_train, y_train), forest.score(x_test, y_test)  # score the forest on each side of the split\n",
+    "print(\"Training set accuracy: %f\" % train_accuracy, \"Testing set accuracy: %f\" % test_accuracy, sep=\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Reuse the split made for the random forest so both models train and test on the same samples (the earlier re-split with train_size=0.25 and test_size=0.25 set half of the data aside unused)\n",
+    "dt = tree.DecisionTreeClassifier().fit(x_train, y_train)\n",
+    "\n",
+    "def measure_performance(x,y,dt, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):\n",
+    "    \"\"\"Print the selected evaluation summaries for classifier dt, given features x and true labels y.\"\"\"\n",
+    "    predicted = dt.predict(x)\n",
+    "    if show_accuracy:\n",
+    "        print(\"Accuracy:{0:.3f}\".format(metrics.accuracy_score(y, predicted)),\"\\n\")\n",
+    "    if show_classification_report:\n",
+    "        print(\"Classification report\")\n",
+    "        print(metrics.classification_report(y, predicted),\"\\n\")\n",
+    "    if show_confussion_matrix:\n",
+    "        print(\"Confusion matrix\")\n",
+    "        print(metrics.confusion_matrix(y, predicted),\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training set:\n",
+      "Accuracy:1.000 \n",
+      "\n",
+      "Classification report\n",
+      "             precision    recall  f1-score   support\n",
+      "\n",
+      "          0       1.00      1.00      1.00        12\n",
+      "          1       1.00      1.00      1.00        12\n",
+      "          2       1.00      1.00      1.00        13\n",
+      "\n",
+      "avg / total       1.00      1.00      1.00        37\n",
+      " \n",
+      "\n",
+      "Confusion matrix\n",
+      "[[12  0  0]\n",
+      " [ 0 12  0]\n",
+      " [ 0  0 13]] \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Training set:\")  # performance on the samples the tree was fitted to\n",
+    "measure_performance(x=x_train, y=y_train, dt=dt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Testing set:\n",
+      "Accuracy:0.974 \n",
+      "\n",
+      "Classification report\n",
+      "             precision    recall  f1-score   support\n",
+      "\n",
+      "          0       1.00      1.00      1.00        13\n",
+      "          1       0.93      1.00      0.96        13\n",
+      "          2       1.00      0.92      0.96        12\n",
+      "\n",
+      "avg / total       0.98      0.97      0.97        38\n",
+      " \n",
+      "\n",
+      "Confusion matrix\n",
+      "[[13  0  0]\n",
+      " [ 0 13  0]\n",
+      " [ 0  1 11]] \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Testing set:\")  # performance on the held-out samples\n",
+    "measure_performance(x=x_test, y=y_test, dt=dt)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}

	fixed_acidity	volatile_acidity	citric_acid	residual_sugar	chlorides	free_sulfur_dioxide	total_sulfur_dioxide	density	ph	sulphates	alcohol	color
0	7	0.27	0.36	20.7	0.045	45	170	1.001	3	0.45	8.8	W
1	6.3	0.3	0.34	1.6	0.049	14	132	0.994	3.3	0.49	9.5	W
2	8.1	0.28	0.4	6.9	0.05	30	97	0.9951	3.26	0.44	10.1	W