From 5d299abb20e823dfe836388b9d03eded927e57df Mon Sep 17 00:00:00 2001 From: S NAJMABADI Date: Mon, 15 Aug 2016 08:56:48 -0400 Subject: [PATCH 1/2] adding donow10 --- .../donow/najmabadi_shannon_10_donow.ipynb | 405 ++++++++++++++++++ 1 file changed, 405 insertions(+) create mode 100644 class10/donow/najmabadi_shannon_10_donow.ipynb diff --git a/class10/donow/najmabadi_shannon_10_donow.ipynb b/class10/donow/najmabadi_shannon_10_donow.ipynb new file mode 100644 index 0000000..e154401 --- /dev/null +++ b/class10/donow/najmabadi_shannon_10_donow.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a classifier to predict the wine color from wine quality attributes using this dataset: http://archive.ics.uci.edu/ml/datasets/Wine+Quality" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The data is in the database we've been using\n", + "+ host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com'\n", + "+ database='training'\n", + "+ port=5432\n", + "+ user='dot_student'\n", + "+ password='qgis'\n", + "+ table name = 'winequality'" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pg8000\n", + "conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', database='training', port=5432, user='dot_student', password='qgis')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "from sklearn import datasets\n", + "from sklearn import tree\n", + "from sklearn import metrics\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.cross_validation import cross_val_score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Query 
for the data and create a numpy array" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'ph', 'sulphates', 'alcohol', 'color']\n" + ] + } + ], + "source": [ + "cursor = conn.cursor()\n", + "cursor.execute(\"SELECT * FROM information_schema.columns WHERE table_name= 'winequality'\")\n", + "column_names = []\n", + "for row in cursor.fetchall():\n", + " column_names.append(row[3])\n", + "print(column_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "statement = \"SELECT fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides, free_sulfur_dioxide, total_sulfur_dioxide, density, ph, sulphates, alcohol, color FROM winequality\"\n", + "cursor.execute(statement)\n", + "wine_quality = []\n", + "for row in cursor:\n", + " wine_quality.append(row)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fixed_acidityvolatile_aciditycitric_acidresidual_sugarchloridesfree_sulfur_dioxidetotal_sulfur_dioxidedensityphsulphatesalcoholcolor
070.270.3620.70.045451701.00130.458.8W
16.30.30.341.60.049141320.9943.30.499.5W
28.10.280.46.90.0530970.99513.260.4410.1W
\n", + "
" + ], + "text/plain": [ + " fixed_acidity volatile_acidity citric_acid residual_sugar chlorides \\\n", + "0 7 0.27 0.36 20.7 0.045 \n", + "1 6.3 0.3 0.34 1.6 0.049 \n", + "2 8.1 0.28 0.4 6.9 0.05 \n", + "\n", + " free_sulfur_dioxide total_sulfur_dioxide density ph sulphates alcohol \\\n", + "0 45 170 1.001 3 0.45 8.8 \n", + "1 14 132 0.994 3.3 0.49 9.5 \n", + "2 30 97 0.9951 3.26 0.44 10.1 \n", + "\n", + " color \n", + "0 W \n", + "1 W \n", + "2 W " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame(wine_quality)\n", + "df.columns = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'ph', 'sulphates', 'alcohol', 'color']\n", + "df.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Split the data into features (x) and target (y, the last column in the table)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "array = np.array(df)\n", + "x = array[:,:11]\n", + "y = array[:,11]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Remember you can cast the results into an numpy array and then slice out what you want" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a decision tree with the data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dt = DecisionTreeClassifier()\n", + "dt = dt.fit(x,y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run 10-fold cross validation on the model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0.97538462, 0.98615385, 0.97692308, 0.98153846, 
0.98153846,\n", + " 0.98307692, 0.97538462, 0.97230769, 0.98459168, 0.97685185])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores = cross_val_score(dt,x,y,cv=10) \n", + "scores" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## If you have time, calculate the feature importance and graph based on the code in the [slides from last class](http://ledeprogram.github.io/algorithms/class9/#21)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['fixed_acidity',\n", + " 'volatile_acidity',\n", + " 'citric_acid',\n", + " 'residual_sugar',\n", + " 'chlorides',\n", + " 'free_sulfur_dioxide',\n", + " 'total_sulfur_dioxide',\n", + " 'density',\n", + " 'ph',\n", + " 'sulphates',\n", + " 'alcohol',\n", + " 'color']" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "columns = list(df.columns)\n", + "columns" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(0, 1)" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAEACAYAAABI5zaHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEAVJREFUeJzt3X2MXNdZx/HvszWl4JaINmrQbohxty1BkZIQlRAoLxOC\niVMkXPEHiZO0dRDIQk1aQQtOKi271oJoJRBtUyByMXGLXFI1rUSQEhr3ZYRa6iSFvEHsxFlvN/Zu\nmqq0pcSowrUf/pixdzLxemY247njM9+PNNLcO2fufXx2/ds755y7G5mJJKksY1UXIEnqP8Ndkgpk\nuEtSgQx3SSqQ4S5JBTLcJalAHcM9InZGxHMR8dhp2nw4Ig5ExCMRcWl/S5Qk9aqbK/c7gatXejEi\nrgEmM/MNwFbgjj7VJklapY7hnplfAr59miabgI832z4AnBMR5/WnPEnSavRjzH0CONSyvdjcJ0mq\niBOqklSgNX04xiLw4y3b5zf3vUhE+ItsJGkVMjN6ad/tlXs0H6dyD/B2gIi4AvhOZj53mgJ9ZDI9\nPV15DcPysC/sC/vi9I/V6HjlHhGfAGrAayLiGWAaeHkjp3NHZt4bEW+JiKeBI8BNq6pEktQ3HcM9\nM6/vos3N/SlHktQPTqhWpFarVV3C0LAvltkXy+yLlyZWO56zqpNF5CDPJ0kliAjyDE2oSpLOIoa7\nJBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtS\ngQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXI\ncJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVqKtwj4iNEbE/Ip6KiG2neP01EXFf\nRDwSEY9HxJa+VypJ6lpk5ukbRIwBTwFXAUvAQ8B1mbm/pc008IrMvC0izgWeBM7LzO+3HSs7nU+S\n9EIRQWZGL+/p5sr9cuBAZi5k5lHgLmBTW5uvA69qPn8V8F/twS5JGpw1XbSZAA61bB+mEfitPgp8\nPiKWgFcC1/anPEnSanQT7t24DXg0M6+MiElgT0RcnJnPtzecmZk5+bxWq1Gr1fpUgiSVoV6vU6/X\nX9IxuhlzvwKYycyNze1bgczMD7S0uRf408z8cnP788C2zPxq27Ecc5ekHp2pMfeHgNdHxLqIeDlw\nHXBPW5t9wK82izgPeCNwsJdCJEn903FYJjOPRcTNwP00fhjszMx9EbG18XLuAP4MuDMiHgUC+KPM\n/NaZLFyStLKOwzJ9PZnDMpLUszM1LCNJOssY7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrsk\nFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCtTxD2RL\nOvPm5xeYmtrF4uJxJibGmJ3dwvr166ouS2cx/0C2VLH5+QU2bLidubntwFrgCJOT0+zZc4sBL8A/\nkC2dlaamdrUEO8Ba5ua2MzW1q8KqdLYz3KWKLS4eZznYT1jL0tLxKspRIQx3qWITE2PAkba9Rxgf\n97+nVs/vHqlis7NbmJycZjngG2Pus7NbKqtJZz8nVKUhcGK1zNLSccbHXS2jF1rNhKrhLklDztUy\nkiTAcJekIhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUBdhXtEbIyI/RHxVERsW6FNLSIejoj/\niIgv9rdMSVIvOt6hGhFjwFPAVcAS8BBwXWbub2lzDvCvwK9l5mJEnJuZ3zzFsbxDVZJ6dKbuUL0c\nOJCZC5l5FLgL2NTW5nrg05m5CHCqYJckDU434T4BHGrZPtzc1+qNwKsj4osR8VBEvK1fBUqSetev\nv6G6BrgM+BUaf3XgKxHxlcx8uk/HlyT1oJtwXwQuaNk+v7mv1
WHgm5n5PeB7EfEvwCXAi8J9Zmbm\n5PNarUatVuutYkkqXL1ep16vv6RjdDOh+jLgSRoTqs8CDwKbM3NfS5sLgduBjcAPAg8A12bmE23H\nckJVknq0mgnVjlfumXksIm4G7qcxRr8zM/dFxNbGy7kjM/dHxGeBx4BjwI72YJckDY5/rEOShpx/\nrEOSBBjuklQkw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3\nSSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJek\nAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgboK94jY\nGBH7I+KpiNh2mnY/ExFHI+I3+1eiJKlXHcM9IsaAjwBXAxcBmyPiwhXavR/4bL+LlCT1ppsr98uB\nA5m5kJlHgbuATadodwtwN/CNPtYnSVqFbsJ9AjjUsn24ue+kiBgH3pqZfwNE/8qTJK1GvyZUPwi0\njsUb8JJUoTVdtFkELmjZPr+5r9WbgLsiIoBzgWsi4mhm3tN+sJmZmZPPa7UatVqtx5IlqWz1ep16\nvf6SjhGZefoGES8DngSuAp4FHgQ2Z+a+FdrfCfxTZn7mFK9lp/NJkl4oIsjMnkZEOl65Z+axiLgZ\nuJ/GMM7OzNwXEVsbL+eO9rf0UoAkqf86Xrn39WReuUtSz1Zz5e4dqpJUIMNdkgpkuEtSgQx3SSqQ\n4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnu\nklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5J\nBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoHWVF2ARtf8/AJTU7tYXDzOxMQYs7NbWL9+XdVl\nSUWIzOzcKGIj8EEaV/o7M/MDba9fD2xrbv4P8HuZ+fgpjpPdnE/lm59fYMOG25mb2w6sBY4wOTnN\nnj23GPBSm4ggM6OX93QclomIMeAjwNXARcDmiLiwrdlB4Jcy8xLgT4CP9lKERs/U1K6WYAdYy9zc\ndqamdlVYlVSObsbcLwcOZOZCZh4F7gI2tTbIzL2Z+d/Nzb3ARH/LVGkWF4+zHOwnrGVp6XgV5UjF\n6SbcJ4BDLduHOX14/w5w30spSuWbmBgDjrTtPcL4uHP8Uj/0dUI1Iq4EbgJ+YaU2MzMzJ5/XajVq\ntVo/S9BZYnZ2C3v3Tr9ozH129paKK5OqV6/XqdfrL+kYHSdUI+IKYCYzNza3bwXyFJOqFwOfBjZm\n5twKx3JCVSedWC2ztHSc8XFXy0grWc2Eajfh/jLgSeAq4FngQWBzZu5raXMB8HngbZm59zTHMtwl\nqUerCfeOwzKZeSwibgbuZ3kp5L6I2Np4OXcAU8Crgb+OiACOZublvf8TJEn90NU6976dzCt3SerZ\nGVnnLkk6+xjuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtS\ngQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXI\ncJekAhnuklQgw12SCrSm6gIGaX5+gampXSwuHmdiYozZ2S2sX7+u6rIkqe8iMwd3sogc5Plazc8v\nsGHD7czNbQfWAkeYnJxmz55bDHhJQy0iyMzo5T0jMywzNbWrJdgB1jI3t52pqV0VViVJZ8bIhPvi\n4nGWg/2EtSwtHa+iHEk6o0Ym3CcmxoAjbXuPMD4+Ml0gaYSMTLLNzm5hcnKa5YBvjLnPzm6prCZJ\nOlNGZkIVllfLLC0dZ3zc1
TJSO1eULRumvljNhOpIhbuklbmibNmw9MWJHzC7d8+cmdUyEbExIvZH\nxFMRsW2FNh+OiAMR8UhEXNpLEZKqNywryubnF7jxxu1ceeU0N964nfn5hYGeH4ajL078gNm9+72r\nen/HcI+IMeAjwNXARcDmiLiwrc01wGRmvgHYCtyx0vGq+mINixPfuJde+o5K+2IY/gOdUK/XKzs3\n2BcnDMOKstZAq9evZPfu97Jhw+0D/5oMQ1+8+AdMjzLztA/gCuC+lu1bgW1tbe4Arm3Z3gecd4pj\nJTyfk5PvyYMHv5aj5uDBr+Xk5HsSnk+YrqwvXlhHVv41mZ6eruS8mfZFqxtumGnphzzZHzfcMFNR\nDdOV1PDiOqrpi1rtj1vOTWaHrG5/dDMsMwEcatk+3Nx3ujaLp2jTNLo3Dw3DR71hqmMY2BfLhmFF\n2TBcMcNw9MWpl293r6LfLTOaNw8NyzfusNQxDOyLZevXr2PPnluYmvrzlhVlg51AXA601q/J4O9H\nGYa+mJ3dwt69082Lj951XC0TEVcAM5m5sbl9K42PCB9oaXMH8MXM/GRzez/wy5n5XNuxXCojSauQ\nPa6W6ebK/SHg9RGxDngWuA7Y3NbmHuCdwCebPwy+0x7sqylOkrQ6HcM9M49FxM3A/TRW1+zMzH0R\nsbXxcu7IzHsj4i0R8TSNz1Q3ndmyJUmnM9CbmCRJgzGwWYpuboQaBRFxfkR8ISL+MyIej4h3VV1T\nlSJiLCL+PSLuqbqWqkXEORHxqYjY1/z++Nmqa6pCRNzW/Pc/FhG7I+LlVdc0SBGxMyKei4jHWvb9\naETcHxFPRsRnI+KcTscZSLh3cyPUCPk+8AeZeRHwc8A7R7gvAN4NPFF1EUPiQ8C9mflTwCU07hcZ\nKc25vd8FfjozL6YxdHxdtVUN3J00srLVrcDnMvMngS8At3U6yKCu3C8HDmTmQmYeBe4CNg3o3EMl\nM7+emY80nz9P4z/wCvcElC0izgfeAvxt1bVULSJ+BPjFzLwTIDO/n5nfrbisKnwX+D9gbUSsAX4Y\nWKq2pMHKzC8B327bvQn4WPP5x4C3djrOoMK9mxuhRk5E/ARwKfBAtZVU5i+BPwSc+IH1wDcj4s7m\nMNWOiPihqosatMz8NvAXwDM0bob8TmZ+rtqqhsJrT6xAzMyvA6/t9IaR+X3uwyYiXgncDby7eQU/\nUiLi14Hnmp9iovkYZWuAy4C/yszLgP+l8VF8pETE64DfB9YB48ArI+L6aqsaSh0viAYV7ovABS3b\n5zf3jaTmx827gb/PzH+sup6KvBn4jYg4CPwDcGVEfLzimqp0GDiUmV9tbt9NI+xHzZuAL2fmtzLz\nGPAZ4OcrrmkYPBcR5wFExI8B3+j0hkGF+8kboZoz39fRuPFpVP0d8ERmfqjqQqqSme/LzAsy83U0\nvh++kJlvr7quqjQ/ch+KiDc2d13FaE40PwlcERGviIig0Q8jN7HMiz/N3gNsaT5/B9DxonAgv1tm\npRuhBnHuYRMRbwZuAB6PiIdpfLx6X2b+c7WVaQi8C9gdET8AHGQEbwbMzEebn+D+DTgGPAzsqLaq\nwYqITwA14DUR8QwwDbwf+FRE/DawAPxWx+N4E5MklccJVUkqkOEuSQUy3CWpQIa7JBXIcJekAhnu\nklQgw12SCmS4S1KB/h+FQThUEJWxmgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(dt.feature_importances_, 'o')\n", + "plt.ylim(0,1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use 
[this tip for getting the column names from your cursor object](http://stackoverflow.com/questions/10252247/how-do-i-get-a-list-of-column-names-from-a-psycopg2-cursor)" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#alternative:\n", + "\n", + "df = pd.read_sql('SELECT * FROM winequality', conn)\n", + "wine = df.as_matrix()\n", + "\n", + "x = wine[:,:-1]\n", + "y = wine[:,-1]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 773757f2db0e5ee4e6bc78595b3e963aac9f712a Mon Sep 17 00:00:00 2001 From: S NAJMABADI Date: Mon, 15 Aug 2016 09:01:38 -0400 Subject: [PATCH 2/2] adding hw9 --- class9/homework/najmabadi_shannon_9_1.ipynb | 153 ++++++++++++++ class9/homework/najmabadi_shannon_9_2.ipynb | 211 ++++++++++++++++++++ 2 files changed, 364 insertions(+) create mode 100644 class9/homework/najmabadi_shannon_9_1.ipynb create mode 100644 class9/homework/najmabadi_shannon_9_2.ipynb diff --git a/class9/homework/najmabadi_shannon_9_1.ipynb b/class9/homework/najmabadi_shannon_9_1.ipynb new file mode 100644 index 0000000..c7e9f66 --- /dev/null +++ b/class9/homework/najmabadi_shannon_9_1.ipynb @@ -0,0 +1,153 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Assignment 1\n", + "\n", + "Use the pseudocode you came up with in class to write your own 5-fold cross-validation function that splits the data set into 5 equal-sized sets\n", + "Don't forget to shuffle the input before assigning to sets\n", + "You can use the fit(), predict(), and score() functions of your model in your 
functions\n", + "Test the results with the sklearn cross_val_score\n", + "In your PR, discuss what challenges you had creating this function and if it helped you better understand cross validation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import random\n", + "import numpy as np\n", + "from sklearn import tree\n", + "from sklearn import metrics\n", + "from sklearn import datasets\n", + "from sklearn.cross_validation import cross_val_score\n", + "\n", + "iris = datasets.load_iris()\n", + "x = iris.data[:,2:]\n", + "y = iris.target" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "a = list(zip(x,y)) #Zip the data\n", + "random.shuffle(a) #Shuffle the data\n", + "x,y = zip(*a) #Unzip the data (*)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Array 1: score of 0.933333333333\n", + "Array 2: score of 0.966666666667\n", + "Array 3: score of 0.9\n", + "Array 4: score of 0.966666666667\n", + "Array 5: score of 0.933333333333\n" + ] + } + ], + "source": [ + "number_of_splices = 5 #Say how many splices we're dividing it into\n", + "list_length = len(a) #Make a variable name for the length of the list \n", + "splice_size = int(list_length / number_of_splices) #Set a variable for the size of each splice\n", + "score_list = [] #Collect the score of every splice so the average can be computed in the next cell\n", + "for i in range(1, number_of_splices + 1): #Loop through the splices\n", + " x_test = x[int(splice_size) * (i-1): int(splice_size * i)] #Separate out testing data. If we break this down, we have int: int, which is making smaller arrays in the size of [int, inclusive: int, exclusive]. The ints say where the index should start and end. In the first int, we multiply the splice size by i-1 because we want to start at index 0. 
\n", + " y_test = y[int(splice_size) * (i-1): int(splice_size * i)] \n", + " x_train = x[0: int(splice_size * (i-1))] + x[int(splice_size * i): int(list_length)] #Separate out training data: everything before and after the test splice. The end index is exclusive, so it must be list_length (not list_length - 1) or the last sample is never trained on\n", + " y_train = y[0: int(splice_size * (i-1))] + y[int(splice_size * i): int(list_length)] \n", + "\n", + " dt = tree.DecisionTreeClassifier().fit(x_train, y_train) \n", + " y_pred = dt.predict(x_test) \n", + " score = metrics.accuracy_score(y_test, y_pred)\n", + " score_list.append(score) #Remember this splice's score for the average\n", + " print(\"Array \" + str(i) + \": score of \" + str(score))" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9447619047619048" + ] + }, + "execution_count": 164, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(score_list) #Get the average score" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.95333333333333348" + ] + }, + "execution_count": 181, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores = cross_val_score(dt,x,y,cv=5) #Test it using cross validation function\n", + "np.mean(scores) #Compare score means" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/class9/homework/najmabadi_shannon_9_2.ipynb b/class9/homework/najmabadi_shannon_9_2.ipynb new file mode 100644 index 0000000..d37d9e8 --- /dev/null +++ b/class9/homework/najmabadi_shannon_9_2.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "Assignment 2\n", + "\n", + "Using the readings, try and create a RandomForestClassifier for the iris dataset\n", + "Using a 25/75 training/test split, compare the results with the original decision tree model and describe the result to the best of your ability in your PR" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from sklearn import tree\n", + "from sklearn import metrics\n", + "from sklearn import datasets\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.cross_validation import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "iris = datasets.load_iris()\n", + "x = iris.data[:,2:]\n", + "y = iris.target" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", + " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,\n", + " oob_score=False, random_state=42, verbose=0, warm_start=False)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25,train_size=0.75)\n", + "forest = RandomForestClassifier(n_estimators=5, random_state=42)\n", + "forest.fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training set accuracy: 1.000000\n", + "Testing set accuracy: 0.947368\n" + ] + } + ], + "source": [ + "print(\"Training set accuracy: %f\" 
% forest.score(x_train, y_train))\n", + "print(\"Testing set accuracy: %f\" % forest.score(x_test, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, train_size=0.25)\n", + "dt = tree.DecisionTreeClassifier()\n", + "dt = dt.fit(x_train, y_train)\n", + "\n", + "def measure_performance(x,y,dt, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):\n", + " y_pred=dt.predict(x)\n", + " if show_accuracy:\n", + " print(\"Accuracy:{0:.3f}\".format(metrics.accuracy_score(y, y_pred)),\"\\n\")\n", + " if show_classification_report:\n", + " print(\"Classification report\")\n", + " print(metrics.classification_report(y,y_pred),\"\\n\")\n", + " if show_confussion_matrix:\n", + " print(\"Confusion matrix\")\n", + " print(metrics.confusion_matrix(y,y_pred),\"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training set:\n", + "Accuracy:1.000 \n", + "\n", + "Classification report\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 12\n", + " 1 1.00 1.00 1.00 12\n", + " 2 1.00 1.00 1.00 13\n", + "\n", + "avg / total 1.00 1.00 1.00 37\n", + " \n", + "\n", + "Confusion matrix\n", + "[[12 0 0]\n", + " [ 0 12 0]\n", + " [ 0 0 13]] \n", + "\n" + ] + } + ], + "source": [ + "print(\"Training set:\") \n", + "measure_performance(x_train, y_train,dt)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing set:\n", + "Accuracy:0.974 \n", + "\n", + "Classification report\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 13\n", + " 1 0.93 1.00 0.96 13\n", + " 2 1.00 0.92 0.96 12\n", + 
"\n", + "avg / total 0.98 0.97 0.97 38\n", + " \n", + "\n", + "Confusion matrix\n", + "[[13 0 0]\n", + " [ 0 13 0]\n", + " [ 0 1 11]] \n", + "\n" + ] + } + ], + "source": [ + "print(\"Testing set:\")\n", + "measure_performance(x_test,y_test,dt)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}