-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathml.py
More file actions
55 lines (45 loc) · 1.83 KB
/
ml.py
File metadata and controls
55 lines (45 loc) · 1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# -*- coding: utf-8 -*-
"""
Python version: 3.7.3
"""
#%%load packages for LDA
import numpy as np
#9f6abe9f34efad365694a3ae52d244ebfdc22429
import panda as pd
import system as sys
#%% LDA
def lda_pitchdecks(text_list, number_topics, number_words, counts=[0]):
    r"""
    Fit an LDA topic model on ``text_list`` and save a pyLDAvis HTML report.

    The documents are turned into a document-term count matrix, an LDA model
    with ``number_topics`` components is fitted on it, the topics are printed
    and an interactive visualisation is written to
    ``C:\dev\doc\etap_platform\LDA_<n>.html``.

    Parameters
    ----------
    text_list : iterable of str
        Documents passed to the vectorizer's ``fit_transform``.
    number_topics : int
        Number of LDA components to fit.
    number_words : int
        Forwarded to ``visualization.print_topics``.
    counts : list
        One-element call counter. The mutable default is deliberate: it is
        shared between calls, so ``counts[0]`` is incremented on every call
        and used as the index in the output file name.

    Returns
    -------
    None
        The result is the HTML file written to disk.

    Notes
    -----
    ``CountVectorizer``, ``LDA``, ``visualization`` and ``pyLDAvis`` must be
    available at module scope; their imports are not visible in this file
    (TODO confirm they are imported elsewhere).
    """
    # Local import: ``os`` is not imported at the top of this file.
    import os

    # Initialise the count vectorizer: only purely alphabetic tokens of at
    # least three letters are counted, and case is preserved.
    # NOTE(review): scikit-learn appears to accept only 'english' as a
    # built-in stop-word name; 'Dutch' is expected to raise ValueError when
    # the vectorizer is fitted. Confirm, and pass an explicit list of Dutch
    # stop words if that is the intent.
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    stop_words='Dutch',
                                    lowercase=False,
                                    token_pattern=r'\b[a-zA-Z]{3,}\b')
    # Generate word counts
    dtm_tf = tf_vectorizer.fit_transform(text_list)

    # Create and fit the LDA model (fixed seed for reproducible topics)
    lda = LDA(n_components=number_topics, random_state=0)
    lda.fit(dtm_tf)

    # Print the topics found by the LDA model
    print("Topics found via LDA:")
    visualization.print_topics(lda, tf_vectorizer, number_words)

    dtm_output = pyLDAvis.sklearn.prepare(lda, dtm_tf, tf_vectorizer)

    outpath = r'C:\dev\doc\etap_platform'
    print('the output of the html file is in the following location', outpath)
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(outpath, exist_ok=True)

    # Advance the shared call counter so each call gets its own file name
    counts[0] += 1
    file_path = os.path.join(outpath, 'LDA_{}.html'.format(counts[0]))
    pyLDAvis.save_html(dtm_output, file_path)
def text_to_wordlist(text):
    """
    Normalise ``text`` and split it into a list of lowercase words.

    The text is first passed through ``normalization_word2vec`` (defined
    outside this block), then lowercased and split on whitespace.

    Parameters
    ----------
    text : str
        Raw text; presumably a single document (verify against callers).

    Returns
    -------
    list of str
        The whitespace-separated, lowercased tokens.
    """
    # Merge conflict resolved in favour of HEAD, which keeps this function.
    review = normalization_word2vec(text)
    words = review.lower().split()
    return words