# Source: ml.py from the weiweivv2222/Learning-git-and-github repository (master branch)
# -*- coding: utf-8 -*-
"""
Python version: 3.7.3
"""

#%%load packages for LDA

import numpy as np
#9f6abe9f34efad365694a3ae52d244ebfdc22429
import panda as pd
import system as sys


#%% LDA
def lda_pitchdecks(text_list, number_topics, number_words, counts=[0]):
    r'''
    Fit an LDA topic model on text_list and save a pyLDAvis report as HTML.

    input:  text_list      documents handed to CountVectorizer.fit_transform
            number_topics  number of components (topics) the LDA model is fit with
            number_words   forwarded to visualization.print_topics
            counts         one-element list used as a call counter. The default
                           list is created once and shared between calls on
                           purpose, so counts[0] keeps growing; leave it at its
                           default unless a separate counter is wanted.
    output: nothing is returned. The topics are printed and the report is
            written to C:\dev\doc\etap_platform\LDA_<n>.html, where <n> is
            counts[0] after it has been incremented for this call.

    NOTE(review): CountVectorizer, LDA, visualization, pyLDAvis and os are not
    imported in the visible part of this file - confirm they are in scope.
    '''
    # Initialise the count vectorizer.
    # NOTE(review): according to the scikit-learn documentation 'english' is
    # the only built-in stop list name, so stop_words='Dutch' is expected to
    # raise ValueError when the vectorizer is fitted. Replace it with an
    # explicit list of Dutch stop words once the intended list is confirmed.
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    stop_words='Dutch',
                                    lowercase=False,
                                    token_pattern=r'\b[a-zA-Z]{3,}\b')
    # Generate the document-term matrix of word counts.
    dtm_tf = tf_vectorizer.fit_transform(text_list)

    # Create and fit the LDA model; the fixed random_state keeps runs repeatable.
    lda = LDA(n_components=number_topics, random_state=0)
    lda.fit(dtm_tf)

    # Print the topics found by the LDA model.
    print("Topics found via LDA:")
    visualization.print_topics(lda, tf_vectorizer, number_words)
    dtm_output = pyLDAvis.sklearn.prepare(lda, dtm_tf, tf_vectorizer)

    outpath = r'C:\dev\doc\etap_platform'
    print('the output of the html file is in the following location',outpath)
    # exist_ok avoids the separate existence check (and the race it implies).
    os.makedirs(outpath, exist_ok=True)

    # The shared counter gives every call its own output file name.
    counts[0] += 1
    file_path = os.path.join(outpath, 'LDA_{}.html'.format(counts[0]))
    pyLDAvis.save_html(dtm_output, file_path)


def text_to_wordlist(text):
    '''
    Normalize text and split it into lower-case words.

    input:  text  raw text handed to normalization_word2vec
    output: the result of normalization_word2vec(text), lower-cased and split
            on whitespace (assumes normalization_word2vec returns a str -
            TODO confirm; it is not defined in the visible part of this file)
    '''
    review = normalization_word2vec(text)
    return review.lower().split()