forked from AICoE/pytorch-lda2vec
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
100 lines (79 loc) · 3.26 KB
/
utils.py
File metadata and controls
100 lines (79 loc) · 3.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import torch as t
import torch.nn.functional as F
import numpy as np
import spacy
import os
from tqdm import tqdm
from gensim import corpora, models
def get_sparsity_score(vec):
    """
    Get Sparsity Score
    Computes a normalized sparsity score of a vector of proportions
    that sum to one: the L1 distance between the vector and the uniform
    distribution over its K elements, divided by 2 * (K - 1) / K, which is
    the value that distance takes for a one-hot vector.
    :param vec: Tensor of proportions - flattened before scoring, so any
        shape is accepted
    :returns: Float - Normalized score; 0.0 for a uniform vector, 1.0 for a
        one-hot vector. Tensors with fewer than two elements score 0.0
    """
    # Detach so the score never extends an autograd graph, and move to the
    # CPU so the result does not depend on the device of the input.
    flat = vec.detach().float().to('cpu').reshape(-1)
    K = flat.numel()
    if K < 2:
        # The normalizer 2 * (K - 1) / K is zero here; report "not sparse"
        # instead of dividing by zero.
        return 0.0
    max_sparsity = 2 * ((K - 1) / K)
    # Subtracting the scalar 1 / K broadcasts against every element, so no
    # explicit uniform vector has to be materialized.
    l1_from_uniform = (flat - 1.0 / K).abs().sum()
    return (l1_from_uniform / max_sparsity).item()
def get_topics(word_embeds, topic_vectors, vocab):
    """
    Get Topics
    Describes every topic vector by the vocabulary entries of the word
    embeddings that get_n_closest_vectors selects for it (using that
    function's default count).
    :param word_embeds: Object whose ``weight`` attribute is the table of
        word vectors that is searched
    :param topic_vectors: Iterable of topic vectors
    :param vocab: Lookup from word index to token string - a list or a
        dict keyed by int
    :returns: List of Strings - one per topic, tokens separated by spaces
    """
    topics = []
    for topic in topic_vectors:
        nearest = get_n_closest_vectors(topic, word_embeds.weight)
        # Convert the index tensor to plain ints before the lookup: tensor
        # elements only work as sequence indices, not as dict keys.
        topics.append(" ".join(vocab[idx] for idx in nearest.tolist()))
    return topics
def get_n_closest_vectors(vec, vector_table, n=10):
    """
    Get N Closest Vectors
    Ranks the rows of vector_table by cosine similarity to vec and returns
    the indices of the n most similar rows.
    :param vec: Tensor - one dimensional query vector
    :param vector_table: Tensor - two dimensional, one candidate vector per row
    :param n: Int - number of indices to return
    :returns: Tensor of row indices, most similar first; shorter than n
        when the table has fewer than n rows
    """
    # Give the query a leading axis of size one so it broadcasts against
    # every row of the table.
    query = vec.unsqueeze(dim=0)
    similarity = F.cosine_similarity(vector_table, query)
    # Higher cosine similarity means closer, so sort in descending order;
    # an ascending sort would return the least similar rows.
    ranked = similarity.argsort(descending=True)
    return ranked[:n]
def get_pretrained_vecs(dataset):
    """
    Build word embedding weights based on pretrained vectors
    Loads the spaCy 'en_core_web_md' pipeline and looks up one vector for
    every key of dataset.term_freq_dict, in that mapping's iteration order.
    :params dataset: Dataset to base vocab on - must expose term_freq_dict
    :returns: A tensor of size: vocab_len x embed_len
    """
    print('Loading Pretrained Vectors...')
    nlp = spacy.load('en_core_web_md')
    vectors = [nlp.vocab[word].vector for word in dataset.term_freq_dict.keys()]
    # Stack into a single ndarray first: building a tensor straight from a
    # list of ndarrays is a much slower conversion path for large vocabularies.
    return t.tensor(np.array(vectors))
def get_doc_vecs_lda_initialization(dataset):
    """
    Runs standard LDA on tokenized docs in dataset
    The result is cached in 'lda-doc-init.pth' under the path prefix
    returned by dataset._get_saved_ds_dir(); when that file already exists
    it is loaded and returned without running LDA again.
    :params dataset: Dataset to get documents from - reads tokenized_docs
        and args.num_topics
    :returns: A tensor of size: num_docs x num_topics; entries for topics
        that LDA does not report for a document are left at zero
    """
    print('Using LDA Document Intializations...')
    cache_path = f'{dataset._get_saved_ds_dir()}lda-doc-init.pth'

    # Reuse a previous run when its output is already on disk
    if os.path.exists(cache_path):
        print('Loading saved lda init file...')
        return t.load(cache_path)

    # Bag-of-words inputs for LDA
    id2word = corpora.Dictionary(dataset.tokenized_docs)
    bow_corpus = [id2word.doc2bow(tokens) for tokens in dataset.tokenized_docs]

    # Fit the model and apply it to the same corpus
    num_topics = dataset.args.num_topics
    lda = models.LdaModel(bow_corpus, alpha=0.9, id2word=id2word, num_topics=num_topics)
    doc_topics = lda[bow_corpus]

    # Print the words of every topic for inspection
    for topic_id, word_probs in lda.show_topics(num_topics, formatted=False):
        words = [word for word, _ in word_probs]
        print('topic', topic_id, ':', ' '.join(words))

    # Scatter the per-document (topic, probability) pairs into a dense matrix
    num_docs = len(doc_topics)
    weights = np.zeros((num_docs, num_topics))
    for doc_idx in tqdm(range(num_docs)):
        for topic_id, prob in doc_topics[doc_idx]:
            weights[doc_idx, topic_id] = prob

    # Convert to a float tensor and persist it for later runs
    weights = t.from_numpy(weights).float()
    print('Saving LDA doc weights init...')
    t.save(weights, cache_path)
    print('Saved LDA doc weight init')
    return weights