diff --git a/class9/homework/Emelike_Mercy_9_1.ipynb b/class9/homework/Emelike_Mercy_9_1.ipynb new file mode 100644 index 0000000..a8605ed --- /dev/null +++ b/class9/homework/Emelike_Mercy_9_1.ipynb @@ -0,0 +1,290 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the pseudocode you came up with in class to write your own 5-fold cross-validation function that splits the data set into 5 equal-sized sets\n", + "- Don't forget to shuffle the input before assigning to sets\n", + "- You can use the fit(), predict(), and score() functions of your model in your functions\n", + "- Test the results with the sklearn cross_val_score\n", + "\n", + "In your PR, discuss what challenges you had creating this function and if it helped you better understand cross validation" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "iris = datasets.load_iris()\n", + "x = iris.data[:,2:] # the attributes\n", + "y = iris.target # the target variables\n", + "dt = tree.DecisionTreeClassifier()\n", + "dt = dt.fit(x,y)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def cv(model, attributes, target, folds):\n", + " \n", + "#import stuff \n", + " import numpy as np\n", + " import random\n", + " from sklearn import datasets\n", + " from sklearn import tree\n", + " from sklearn.utils import shuffle\n", + " \n", + "# print('the entire data set is', len(attributes), 'rows long')\n", + " len_subset1 = len(attributes)//folds #30\n", + "# print('we have', folds, 'subsets.')\n", + "# print('each subset is', len_subset1, 'rows long')\n", + " len_subset2 = len_subset1//folds #6\n", + "# print('each subset is divide into', len_subset2, 'sections')\n", + " \n", + " scores = []\n", + "\n", + " n = 0\n", + " while n<(folds): \n", + " for a in range(0,folds):\n", + "\n", + " n=n+1\n", + " \n", + "# print('loop:', a) \n", + " \n", + "# print('shuffle the attributes and target classifications together. 
select seed randomly')\n", + " attributes_s, target_s = shuffle(attributes, target, random_state=random.seed())\n", + " \n", + "# print('link the attributes and targets in a list')\n", + " attr_targ = list(zip(attributes_s, target_s))\n", + " \n", + "# print('list length is', len(attr_targ))\n", + " \n", + " if len(attr_targ) > 30:\n", + "# print('list length greater than 30')\n", + "# print('assign the first', len_subset1, 'rows to the subset')\n", + " attr_targ_subset = attr_targ[(len_subset1):] # 150 rows; 120 rows, 90 rows\n", + "\n", + " \n", + "# print('**remove the subset from the list by starting at row', (n)*len_subset1, 'leaving', len(attributes_s)-len_subset1)\n", + " attr_targ = attr_targ[len_subset1:]\n", + "\n", + "# print('**empty attributes and targets so we can reassign')\n", + " attributes = []\n", + " target = []\n", + "\n", + "# print('**assign attributes and targets from the shortened list to their respective lists')\n", + " for x,y in attr_targ: \n", + " attributes.append(x)\n", + " target.append(y)\n", + "\n", + "# print('now the list length is', len(attributes))\n", + "\n", + "# print('assign training data from row', len_subset2, 'to', folds*len_subset2, 'of the subset')\n", + " train = attr_targ_subset[len_subset2:(folds*len_subset2)] # training, assigned from row 6 to 30 of subset\n", + "\n", + "\n", + "# print('create empty lists for training attributes and training targets')\n", + " train_attr = []\n", + " train_targ = []\n", + "\n", + "# print('append separated elements to respective lists: train_attr or train_targ')\n", + " for x,y in train: \n", + " train_attr.append(x)\n", + " train_targ.append(y)\n", + "# print('train_attr length is', len(train_attr))\n", + "# print('train_targ length is', len(train_targ))\n", + "\n", + "# print('assign testing data from row', 0, 'to', len_subset2, 'of the subset')\n", + " test = attr_targ_subset[0:len_subset2] # test assigned from row 0 to 6 of subset\n", + "\n", + "# print('create empty lists for test attributes and test targets')\n", + " test_attr = []\n", + " test_targ = []\n", + "\n", + "# print('append separated elements to respective lists: test_attr or test_targ')\n", + " for x,y in test: \n", + " test_attr.append(x)\n", + " test_targ.append(y)\n", + "# print('test_attr length is', len(test_attr))\n", + "# print('test_targ length is', len(test_targ))\n", + "\n", + "\n", + "# print('fit model on training data')\n", + " model_ = model.fit(train_attr,train_targ)\n", + "\n", + "# print('validate model on test data')\n", + " predict = model_.predict(test_attr)\n", + "# print('attributes:', test_attr, 'predictions:', predict, 'true values:', test_targ)\n", + "\n", + " score = model_.score(test_attr,test_targ)\n", + "# print('model score is:', score) \n", + "\n", + " scores.append(score)\n", + "# print('append score to list:', scores)\n", + "\n", + "\n", + "# print('validation complete on subset',n)\n", + "\n", + " \n", + " else: \n", + "# print('list length less than 30')\n", + "# print('assign training data from row', len_subset2, 'to', len(attr_targ), 'of the subset')\n", + "\n", + " train = attr_targ_subset[len_subset2:len(attr_targ)] # training, assigned from row 6 to 30 of subset\n", + "\n", + "\n", + "# print('create empty lists for training attributes and training targets')\n", + " train_attr = []\n", + " train_targ = []\n", + "\n", + "# print('append separated elements to respective lists: train_attr or train_targ')\n", + " for x,y in train: \n", + " train_attr.append(x)\n", + " train_targ.append(y)\n", + "# 
print('train_attr length is', len(train_attr))\n", + "# print('train_targ length is', len(train_targ))\n", + "\n", + "# print('assign testing data from row', 0, 'to', len_subset2, 'of the subset')\n", + " test = attr_targ_subset[0:len_subset2] # test assigned from row 0 to 6 of subset\n", + "\n", + "# print('create empty lists for test attributes and test targets')\n", + " test_attr = []\n", + " test_targ = []\n", + "\n", + "# print('append separated elements to respective lists: test_attr or test_targ')\n", + " for x,y in test: \n", + " test_attr.append(x)\n", + " test_targ.append(y)\n", + "# print('test_attr length is', len(test_attr))\n", + "# print('test_targ length is', len(test_targ))\n", + "\n", + "\n", + "# print('fit model on training data')\n", + " model_ = model.fit(train_attr,train_targ)\n", + "\n", + "# print('validate model on test data')\n", + " predict = model_.predict(test_attr)\n", + "# print('attributes:', test_attr, 'predictions:', predict, 'true values:', test_targ)\n", + "\n", + " score = model_.score(test_attr,test_targ)\n", + "# print('model score is:', score) \n", + "\n", + " scores.append(score)\n", + "# print('append score to list:', scores)\n", + "\n", + "\n", + "# print('validation complete on subset',n)\n", + "\n", + " \n", + " accuracy_score = np.mean(scores)\n", + " \n", + " return accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false, + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.96666666666666679" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv(dt,x,y,5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Testing with cross_val_score" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from sklearn.cross_validation import cross_val_score" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "scores = cross_val_score(dt,x,y,cv=5) " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.96000000000000019" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(scores)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.1" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/class9/homework/Emelike_Mercy_9_2.ipynb b/class9/homework/Emelike_Mercy_9_2.ipynb new file mode 100644 index 0000000..21a1086 --- /dev/null +++ b/class9/homework/Emelike_Mercy_9_2.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "Using the readings, try and create a RandomForestClassifier for the iris dataset\n", + "Using a 75/25 training/test split, compare the results with the original decision tree model and describe the result to the best of your ability in your PR" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": { + "collapsed": true + }, + 
"outputs": [], + "source": [ + "import numpy as np\n", + "import random\n", + "from sklearn import datasets\n", + "from sklearn import tree\n", + "from sklearn.utils import shuffle" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "iris = datasets.load_iris()" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# n_estimators, number of trees in the forest\n", + "def rf(ds, n_estimators): \n", + " \n", + " n = 0\n", + " trees = []\n", + " scores = []\n", + " \n", + " while n 1]\n", + "labels = [t[1] for t in training if len(t) > 1]\n", + "\n", + "# A little bit of cleanup for scikit-learn's benefit. Scikit-learn models wants our categories to\n", + "# be numbers, not strings. The LabelEncoder performs this transformation.\n", + "# Assigns LabelEncoder to variable, encoder\n", + "encoder = preprocessing.LabelEncoder()\n", + "# Fits encoder to labels list. Returns an array. \n", + "correct_labels = encoder.fit_transform(labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "########## STEP 2: FEATURE EXTRACTION ##########\n", + "\n", + "# CountVectorizer implements both tokenization and occurrence counting in a single class\n", + "# Tokenization is the process of rocess of breaking a stream of text up into words, \n", + "# phrases, symbols, or other meaningful elements called tokens \n", + "# The CountVectorizer gives a token id for each possible token, for instance using white space and punctuation as token\n", + "# separators. \n", + "# Occurrence counting is counting the occurrence of tokens in a document \n", + "# Stop_words is a parameter of the CountVectorizer. When set to 'english' a built-in stop word list for English is used\n", + "# Here, we are assigning the countvectorizer to a variable called vectorizer\n", + "vectorizer = CountVectorizer(stop_words='english')\n", + "\n", + "# The fit_transform method learns the vocabulary dictionary and returns term-document matrix (array)\n", + "# The array has samples (which are lines of text separated by stop words) as its rows and features as it columns\n", + "# The array is filled with 0/1, 0 this feature is not in the sample, 1 it is. \n", + "# Here, we fit the vectorizer on the text list. The vectorizer returns an array. \n", + "# We assign this array to a variable called data\n", + "data = vectorizer.fit_transform(text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "########## STEP 3: MODEL BUILDING ##########\n", + "\n", + "# DecisionTreeClassifier() that predicts the value of a target variable by learning simple decision rules inferred from \n", + "# the data features\n", + "# Assign classifer to variable, model\n", + "model = DecisionTreeClassifier()\n", + "\n", + "# Fit the model on the features array to predict the correct labels \n", + "fit_model = model.fit(data, correct_labels)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/sklearn/cross_validation.py:516: Warning: The least populated class in y has only 1 members, which is too few. 
The minimum number of labels for any class cannot be less than n_folds=5.\n", + " % (min_labels, self.n_folds)), Warning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.65 (+/- 0.05)\n" + ] + } + ], + "source": [ + "# ########## STEP 4: EVALUATION ##########\n", + "\n", + "# Evaluate our model with 5-fold cross-validation\n", + "# This means that we split the data into five sections \n", + "# Train on four sections, test on the remaining section, and repeat until every section has been used for testing \n", + "# Save the scores to a variable \n", + "\n", + "scores = cross_validation.cross_val_score(model, data, correct_labels, cv=5)\n", + "\n", + "# Print the mean and standard deviation of the scores from cross validation\n", + "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Public postsecondary education: executive officer compensation. -> ['Education']\n", + "An act to add Section 236.3 to the Education code, related to the pricing of college textbooks. -> ['Education']\n", + "Political Reform Act of 1974: campaign disclosures. -> ['Campaign Finance and Election Issues']\n", + "An act to add Section 236.3 to the Penal Code, relating to human trafficking. -> ['Crime']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n", + " DeprecationWarning)\n", + "/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n", + " DeprecationWarning)\n", + "/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n", + " DeprecationWarning)\n", + "/Users/mercyemelike/.virtualenvs/data_analysis/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n", + " DeprecationWarning)\n" + ] + } + ], + "source": [ + "# ########## STEP 5: APPLYING THE MODEL ##########\n", + "\n", + "# Samples to use in the model\n", + "docs_new = [\"Public postsecondary education: executive officer compensation.\",\n", + " \"An act to add Section 236.3 to the Education code, related to the pricing of college textbooks.\",\n", + " \"Political Reform Act of 1974: campaign disclosures.\",\n", + " \"An act to add Section 236.3 to the Penal Code, relating to human trafficking.\"\n", + " ]\n", + "\n", + "# Apply the vectorizer to the test samples and save to a variable, test_data\n", + "test_data = vectorizer.transform(docs_new)\n", + "\n", + "# Loop through elements of docs_new \n", + "# For each document, print the document text followed by the label the model predicts for it, \n", + "# translated back to its string name with the classes_ attribute of the encoder assigned in step 1\n", + "# The classes_ attribute holds the original string label for each class\n", + "# model.predict returns integer labels, which are used to index into encoder.classes_\n", + "for i in range(len(docs_new)):\n", + " print('%s -> %s' % (docs_new[i], encoder.classes_[model.predict(test_data.toarray()[i])]))\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.1" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}
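A minimal sketch of the 5-fold split that Emelike_Mercy_9_1.ipynb implements with cv(): shuffle the row indices once, cut them into five equal folds, hold each fold out in turn, and average the scores. The function name cross_validate and the use of numpy index arrays are illustrative choices, not part of the notebook.

import numpy as np
from sklearn import datasets, tree

def cross_validate(model, X, y, folds=5):
    # Shuffle the row indices once, then cut them into `folds` roughly equal sets
    indices = np.random.permutation(len(X))
    fold_indices = np.array_split(indices, folds)

    scores = []
    for k in range(folds):
        test_idx = fold_indices[k]  # the held-out fold
        train_idx = np.concatenate([fold_indices[j] for j in range(folds) if j != k])
        model.fit(X[train_idx], y[train_idx])
        scores.append(model.score(X[test_idx], y[test_idx]))
    return np.mean(scores)

iris = datasets.load_iris()
X, y = iris.data[:, 2:], iris.target
print(cross_validate(tree.DecisionTreeClassifier(), X, y, folds=5))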
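For the Emelike_Mercy_9_2.ipynb assignment (a RandomForestClassifier on iris with a 75/25 training/test split, compared against the original decision tree), a minimal sketch using scikit-learn's built-in ensemble class rather than the notebook's hand-rolled rf(); the random_state and n_estimators values are arbitrary illustration choices.

from sklearn import datasets, tree
from sklearn.cross_validation import train_test_split  # sklearn 0.17-era module, matching the notebooks
from sklearn.ensemble import RandomForestClassifier

iris = datasets.load_iris()
# 75/25 training/test split
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=0)

dt = tree.DecisionTreeClassifier().fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)

print('decision tree accuracy:', dt.score(X_test, y_test))
print('random forest accuracy:', rf.score(X_test, y_test))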
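The DeprecationWarnings recorded in step 5 of Emelike_Mercy_9_3.ipynb come from passing single 1-D rows (test_data.toarray()[i]) to predict(). A small sketch of an alternative, assuming the vectorizer, model, encoder, and docs_new objects already defined in that notebook: predict on the whole 2-D array at once and map the integer predictions back to their string labels with the encoder.

# Assumes vectorizer, model, encoder, and docs_new from Emelike_Mercy_9_3.ipynb are in scope
test_data = vectorizer.transform(docs_new)      # one row per document
predicted = model.predict(test_data.toarray())  # predict on the full 2-D array, so no 1-D warning
for doc, label in zip(docs_new, encoder.inverse_transform(predicted)):
    print('%s -> %s' % (doc, label))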