optimization-/classifierOpt.py at master · mishugeb/optimization- · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 20 16:49:34 2016

@author: Ashiqul
"""
from __future__ import division
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from Bio import SeqIO
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from feature_extraction import extract_motifs_pos
from sklearn.model_selection import ShuffleSplit


def _load_motif_corpus(fasta_path, label, maxWordLength, maxDistLength,
                       wordMotifMaxBuffer, distMotifMaxBuffer):
    """Build one motif document per record of a FASTA file.

    Each sequence is lower-cased, stripped of "x" and "-" characters and
    passed to ``extract_motifs_pos``; the returned string is the document.

    Returns:
        A ``(documents, labels)`` pair of equal-length lists, where every
        entry of ``labels`` is ``label``.
    """
    documents = []
    # The context manager closes the handle even if parsing or motif
    # extraction raises part-way through the file.
    with open(fasta_path) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # str(seq) instead of seq.tostring(): the latter was removed
            # from newer Biopython releases, str() works on all of them.
            residues = str(record.seq).lower().replace("x", "").replace("-", "")
            documents.append(
                extract_motifs_pos(residues, 1, maxWordLength, 1, maxDistLength,
                                   wordMotifMaxBuffer, distMotifMaxBuffer))
    return documents, [label] * len(documents)


def classifier(maxWordLength, maxDistLength, wordMotifMaxBuffer, distMotifMaxBuffer):
    """Score one motif-extraction configuration by cross-validated MLP accuracy.

    Reads positives from ``ch.fasta`` (label "1") and negatives from
    ``nonch.fasta`` (label "0"), turns every sequence into a motif document,
    builds a binary bag-of-words matrix, keeps the (at most) 500 columns a
    random forest ranks as most important, and cross-validates an MLP on them.

    Args:
        maxWordLength, maxDistLength, wordMotifMaxBuffer, distMotifMaxBuffer:
            forwarded unchanged to ``extract_motifs_pos`` (their exact meaning
            is defined there, not in this file).

    Returns:
        Mean of the 10 ``cross_val_score`` results (the estimator's default
        score) over 80/20 shuffle splits.
    """
    motif_args = (maxWordLength, maxDistLength, wordMotifMaxBuffer, distMotifMaxBuffer)
    pos_docs, pos_tags = _load_motif_corpus("ch.fasta", "1", *motif_args)
    neg_docs, neg_tags = _load_motif_corpus("nonch.fasta", "0", *motif_args)
    corpus = np.array(pos_docs + neg_docs)
    tag = np.array(pos_tags + neg_tags)

    # The original also passed stop_words=[1, 2]; integers can never equal a
    # string token, so that list filtered nothing and is dropped here.
    count = CountVectorizer(max_features=15000000, vocabulary=None,
                            max_df=0.3, min_df=3)
    bag = count.fit_transform(corpus).toarray()
    bag[bag > 0] = 1  # presence/absence instead of raw counts

    forest = RandomForestClassifier(n_estimators=1000, random_state=1, n_jobs=1)
    # Fit on every column: the matrix holds features only (labels live in
    # `tag`), so slicing off the last column would silently exclude one
    # vocabulary term from the ranking.
    forest.fit(bag, tag)
    ranked = np.argsort(forest.feature_importances_)[::-1]
    # Slicing (rather than indexing 0..499) also works when the vocabulary
    # has fewer than 500 terms.
    x = bag[:, ranked[:500]]

    # NOTE(review): the feature ranking above sees the labels of every
    # sample, including those later held out by the splitter, so the
    # returned score is optimistic. Moving selection inside the CV loop
    # would remove that leak.
    # Seeded like the other estimators so that scores from different
    # parameter settings are computed on the same splits.
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=1)
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(15,), random_state=1)
    return np.mean(cross_val_score(clf, x, tag, cv=cv))

def main():
    """Sweep each extraction parameter in turn and print the resulting scores.

    One parameter is varied at a time while the other three stay at their
    baseline values. For every sweep a header line and the list of scores
    (one per swept value, in order) are printed.
    """
    baseline = {
        "maxWordLength": 15,
        "maxDistLength": 15,
        "wordMotifMaxBuffer": 1,
        "distMotifMaxBuffer": 0,
    }
    sweeps = (
        ("maxWordLength", range(2, 40)),
        ("maxDistLength", range(2, 40)),
        ("wordMotifMaxBuffer", range(0, 10)),
        ("distMotifMaxBuffer", range(0, 10)),
    )
    for name, values in sweeps:
        scores = [classifier(**dict(baseline, **{name: value}))
                  for value in values]
        # The header is derived from the range itself: range() excludes its
        # upper bound, so the hand-written "2 - 40" / "0-10" labels named a
        # value that was never evaluated.
        print("%s %d-%d" % (name, values[0], values[-1]))
        print(scores)

# Run the parameter sweeps only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
	main()