-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLanguageProcessing.py
More file actions
96 lines (76 loc) · 3.25 KB
/
LanguageProcessing.py
File metadata and controls
96 lines (76 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Short sample sentence for trying out the word counter on a small input.
text = (
    "This is my test text. "
    "We're keeping this text short to keep things manageable."
)
def count_words(text):
    '''Count how many times each word occurs in a string.

    The text is lower-cased and the characters . , : ; " ' are removed
    before counting, so "Text." and "text" count as the same word.

    Args:
        text: The string to analyse.

    Returns:
        A dict mapping each unique word to its number of occurrences.
        An empty or whitespace-only string gives an empty dict.
    '''
    text = text.lower()
    skip = ['.', ',', ':', ';', '"', '\'']
    for punc in skip:
        text = text.replace(punc, '')

    word_counts = {}
    # split() with no argument splits on any run of whitespace, so repeated
    # spaces, tabs and line breaks never produce an empty-string "word"
    # (which split(' ') would).
    for word in text.split():
        word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts
# Try the counter on the sample sentence; the returned dict is not stored.
count_words(text)
from collections import Counter
def count_words_fast(text):
    '''Count how many times each word occurs in a string, using Counter.

    The text is lower-cased and the characters . , : ; " ' are removed
    before counting, so "Text." and "text" count as the same word.

    Args:
        text: The string to analyse.

    Returns:
        A collections.Counter mapping each unique word to its number of
        occurrences. An empty or whitespace-only string gives an empty
        Counter.
    '''
    # Delete all skipped punctuation characters in a single pass.
    skip = '.,:;"\''
    text = text.lower().translate(str.maketrans('', '', skip))
    # split() with no argument splits on any run of whitespace, so repeated
    # spaces, tabs and line breaks never produce an empty-string "word"
    # (which split(' ') would).
    word_counts = Counter(text.split())
    return word_counts
def read_book(title_path):
    '''Read a book from disk and return it as a single-line string.

    Args:
        title_path: Path to a UTF-8 encoded text file.

    Returns:
        The file's contents with every line break replaced by one space.
    '''
    with open(title_path, 'r', encoding='utf8') as current_file:
        text = current_file.read()
    # Replace line breaks with a space instead of deleting them: deleting
    # would glue the last word of a line to the first word of the next and
    # distort any word count made from the result. '\r\n' is handled first
    # so that a two-character line ending yields a single space.
    text = text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
    return text
def word_stats(word_counts):
    '''Summarise a word-count mapping.

    Returns a (num_unique, counts) tuple: the number of distinct words and
    the per-word frequencies as given by word_counts.values().
    '''
    return len(word_counts), word_counts.values()
# Compute word statistics for Romeo and Juliet in English, then in German.
# Each pass overwrites text, word_counts, num_unique and counts, so the
# German results are the ones left behind.
for text in map(read_book, (
    './Books/English/shakespeare/Romeo and Juliet.txt',
    './Books/German/shakespeare/Romeo und Julia.txt',
)):
    word_counts = count_words(text)
    num_unique, counts = word_stats(word_counts)
import os
import pandas as pd
# Build one row of statistics per book found under book_dir.
# The loops walk a <book_dir>/<language>/<author>/<title> directory layout.
book_dir = './Books'
stats = pd.DataFrame(columns=('language', 'author', 'title', 'length', 'unique'))
title_num = 1
# os.listdir() returns entries in arbitrary order; sorting keeps the row
# numbering reproducible from run to run.
for language in sorted(os.listdir(book_dir)):
    language_dir = os.path.join(book_dir, language)
    # Skip stray files (e.g. .DS_Store) that would make listdir() raise.
    if not os.path.isdir(language_dir):
        continue
    for author in sorted(os.listdir(language_dir)):
        author_dir = os.path.join(language_dir, author)
        if not os.path.isdir(author_dir):
            continue
        for title in sorted(os.listdir(author_dir)):
            inputfile = os.path.join(author_dir, title)
            # Only regular files can be read as books.
            if not os.path.isfile(inputfile):
                continue
            text = read_book(inputfile)
            (num_unique, counts) = word_stats(count_words(text))
            # length = total number of words, unique = number of distinct words.
            stats.loc[title_num] = language, author.capitalize(), title.replace('.txt', ''), sum(counts), num_unique
            title_num += 1
from matplotlib import pyplot
# Scatter plot of book length against number of unique words, all books.
pyplot.plot(stats['length'], stats['unique'], 'bo')
pyplot.savefig('length vs unique.jpg')
pyplot.show()

# The same quantities on log-log axes, one colour per language.
pyplot.figure(figsize=(10, 10))
for lang_label, lang_color in (
    ('English', 'crimson'),
    ('French', 'forestgreen'),
    ('German', 'orange'),
    ('Portuguese', 'blueviolet'),
):
    subset = stats[stats['language'] == lang_label]
    pyplot.loglog(subset['length'], subset['unique'], 'o', label=lang_label, color=lang_color)
pyplot.legend()
pyplot.xlabel('Book length')
pyplot.ylabel('Number of unique words')
pyplot.savefig('Language plot.jpg')
pyplot.show()