diff --git a/README.rst b/README.rst index 6e59ec6..fe4ea86 100644 --- a/README.rst +++ b/README.rst @@ -1,13 +1,18 @@ =========== autocorrect =========== + Python 3 Spelling Corrector +Supports English and Bengali +======= +Python 3 Spelling Corrector for bengali + Installation ============ .. code-block:: bash - pip install autocorrect + pip install https://github.com/SubrataSarkar32/autocorrect.git Examples ======== @@ -16,6 +21,8 @@ Examples >>> from autocorrect import spell >>> spell('HTe') 'The' + #for bengali + >>> spell('কখন',language='bn') Contribute ========== diff --git a/autocorrect.egg-info/PKG-INFO b/autocorrect.egg-info/PKG-INFO new file mode 100644 index 0000000..279a4a2 --- /dev/null +++ b/autocorrect.egg-info/PKG-INFO @@ -0,0 +1,16 @@ +Metadata-Version: 2.1 +Name: autocorrect +Version: 0.3.0 +Summary: Python 3 Spelling Corrector +Home-page: https://github.com/phatpiglet/autocorrect/ +Author: Jonas McCallum +Author-email: jonasmccallum@gmail.com +License: http://www.opensource.org/licenses/mit-license.php +Keywords: autocorrect spelling corrector +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: MIT License +Classifier: Natural Language :: English +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +License-File: LICENSE diff --git a/autocorrect.egg-info/SOURCES.txt b/autocorrect.egg-info/SOURCES.txt new file mode 100644 index 0000000..1fe11ab --- /dev/null +++ b/autocorrect.egg-info/SOURCES.txt @@ -0,0 +1,15 @@ +LICENSE +README.rst +setup.py +autocorrect/__init__.py +autocorrect/nlp_parser.py +autocorrect/nlp_parser_bn.py +autocorrect/utils.py +autocorrect/word.py +autocorrect/word_lists.py +autocorrect/word_lists_bn.py +autocorrect/words.bz2 +autocorrect.egg-info/PKG-INFO +autocorrect.egg-info/SOURCES.txt +autocorrect.egg-info/dependency_links.txt +autocorrect.egg-info/top_level.txt \ No newline at end of file diff --git a/autocorrect.egg-info/dependency_links.txt b/autocorrect.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/autocorrect.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/autocorrect.egg-info/top_level.txt b/autocorrect.egg-info/top_level.txt new file mode 100644 index 0000000..f15cbe7 --- /dev/null +++ b/autocorrect.egg-info/top_level.txt @@ -0,0 +1 @@ +autocorrect diff --git a/autocorrect/__init__.py b/autocorrect/__init__.py index 04bfb2d..7016b08 100644 --- a/autocorrect/__init__.py +++ b/autocorrect/__init__.py @@ -8,19 +8,32 @@ # http://www.opensource.org/licenses/mit-license.php """ Spell function - +Modified by Subrata Sarkar +https://github.com/SubrataSarkar32 Author: Jonas McCallum https://github.com/foobarmus/autocorrect """ from autocorrect.nlp_parser import NLP_COUNTS +from autocorrect.nlp_parser_bn import NLP_COUNTS as NLP_COUNTS_BN from autocorrect.word import Word, common, exact, known, get_case -def spell(word): - """most likely correction for everything up to a double typo""" - w = Word(word) - candidates = (common([word]) or exact([word]) or known([word]) or - known(w.typos()) or common(w.double_typos()) or - [word]) - correction = max(candidates, key=NLP_COUNTS.get) - return get_case(word, correction) +def spell(word,language='en'): + """The language parameter takes into account of the language. + most likely correction for everything up to a double typo""" + if(language == 'en'): + w = Word(word) + candidates = (common([word]) or exact([word]) or known([word]) or + known(w.typos()) or common(w.double_typos()) or + [word]) + correction = max(candidates, key=NLP_COUNTS.get) + return get_case(word, correction) + elif(language == 'bn'): + w = Word(word) + candidates = (common([word]) or exact([word]) or known([word]) or + known(w.typos()) or common(w.double_typos()) or + [word]) + correction = max(candidates, key=NLP_COUNTS_BN.get) + return get_case(word, correction) + else: + raise ValueError("This language is not supported") diff --git a/autocorrect/__pycache__/__init__.cpython-36.pyc b/autocorrect/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..1c84e9b Binary files /dev/null and b/autocorrect/__pycache__/__init__.cpython-36.pyc differ diff --git a/autocorrect/__pycache__/nlp_parser.cpython-36.pyc b/autocorrect/__pycache__/nlp_parser.cpython-36.pyc new file mode 100644 index 0000000..c930a57 Binary files /dev/null and b/autocorrect/__pycache__/nlp_parser.cpython-36.pyc differ diff --git a/autocorrect/__pycache__/utils.cpython-36.pyc b/autocorrect/__pycache__/utils.cpython-36.pyc new file mode 100644 index 0000000..73ca5a7 Binary files /dev/null and b/autocorrect/__pycache__/utils.cpython-36.pyc differ diff --git a/autocorrect/__pycache__/word.cpython-36.pyc b/autocorrect/__pycache__/word.cpython-36.pyc new file mode 100644 index 0000000..64029b1 Binary files /dev/null and b/autocorrect/__pycache__/word.cpython-36.pyc differ diff --git a/autocorrect/__pycache__/word_lists.cpython-36.pyc b/autocorrect/__pycache__/word_lists.cpython-36.pyc new file mode 100644 index 0000000..3fc6188 Binary files /dev/null and b/autocorrect/__pycache__/word_lists.cpython-36.pyc differ diff --git a/autocorrect/nlp_parser.py b/autocorrect/nlp_parser.py index 813d4f2..728d730 100644 --- a/autocorrect/nlp_parser.py +++ b/autocorrect/nlp_parser.py @@ -13,7 +13,7 @@ https://github.com/foobarmus/autocorrect """ -from autocorrect.utils import words_from_archive, zero_default_dict +from autocorrect.utils import words_from_archive, zero_default_dict,words_from_file,words_from_archive1 def parse(lang_sample): """tally word popularity using novel extracts, etc""" @@ -23,4 +23,14 @@ def parse(lang_sample): counts[word] += 1 return set(words), counts +def parse1(lang_sample): + """tally word popularity using novel extracts, etc""" + words = words_from_archive1(lang_sample, include_dups=True) + counts = zero_default_dict() + for word in words: + counts[word] += 1 + return set(words), counts + NLP_WORDS, NLP_COUNTS = parse('big.txt') +#NLP_WORDS, NLP_COUNTS = parse1('bengnovasssh.txt') +#NLP_WORDS, NLP_COUNTS = set(list(NLP_WORDS_1) + list(NLP_WORDS_2)), NLP_COUNTS_1 diff --git a/autocorrect/nlp_parser_bn.py b/autocorrect/nlp_parser_bn.py new file mode 100644 index 0000000..26a42df --- /dev/null +++ b/autocorrect/nlp_parser_bn.py @@ -0,0 +1,36 @@ +# Python 3 Spelling Corrector +# +# Copyright 2014 Jonas McCallum. +# Updated for Python 3, based on Peter Norvig's +# 2007 version: http://norvig.com/spell-correct.html +# +# Open source, MIT license +# http://www.opensource.org/licenses/mit-license.php +""" +NLP parser + +Author: Jonas McCallum +https://github.com/foobarmus/autocorrect + +""" +from autocorrect.utils import words_from_archive, zero_default_dict,words_from_file,words_from_archive1 + +def parse(lang_sample): + """tally word popularity using novel extracts, etc""" + words = words_from_archive(lang_sample, include_dups=True) + counts = zero_default_dict() + for word in words: + counts[word] += 1 + return set(words), counts + +def parse1(lang_sample): + """tally word popularity using novel extracts, etc""" + words = words_from_archive1(lang_sample, include_dups=True) + counts = zero_default_dict() + for word in words: + counts[word] += 1 + return set(words), counts + +#NLP_WORDS_1, NLP_COUNTS_1 = parse('big.txt') +NLP_WORDS, NLP_COUNTS = parse1('bengnovasssh.txt') +#NLP_WORDS, NLP_COUNTS = set(list(NLP_WORDS_1) + list(NLP_WORDS_2)), NLP_COUNTS_1 diff --git a/autocorrect/utils.py b/autocorrect/utils.py index bdf2638..b7641af 100644 --- a/autocorrect/utils.py +++ b/autocorrect/utils.py @@ -13,14 +13,42 @@ https://github.com/foobarmus/autocorrect """ -import re, os, tarfile +import re, os, tarfile, io from contextlib import closing from itertools import chain PATH = os.path.abspath(os.path.dirname(__file__)) BZ2 = 'words.bz2' RE = '[A-Za-z]+' +RE1 = '[\w\s]+' +def words_from_file(filename, include_dups=False, map_case=False): + filepath=os.path.join(PATH, filename) + with io.open(filepath,'r',encoding='utf8') as f: + text = f.read() + words = text.split() + if include_dups: + return words + elif map_case: + return words + else: + return set(words) + +def words_from_archive1(filename, include_dups=False, map_case=False): + """extract words from a text file in the archive""" + bz2 = os.path.join(PATH, BZ2) + tar_path = '{}/{}'.format('words', filename) + with closing(tarfile.open(bz2, 'r:bz2')) as t: + with closing(t.extractfile(tar_path)) as f: + words = re.findall(RE1, f.read().decode(encoding='utf-8')) + if include_dups: + return words + elif map_case: + return {w.lower():w for w in words} + else: + return set(words) + + def words_from_archive(filename, include_dups=False, map_case=False): """extract words from a text file in the archive""" bz2 = os.path.join(PATH, BZ2) @@ -44,7 +72,7 @@ def concat(*args): class Zero(dict): """dict with a zero default""" - + def __getitem__(self, key): return self.get(key) diff --git a/autocorrect/word.py b/autocorrect/word.py index 8d501ae..6cb8207 100644 --- a/autocorrect/word.py +++ b/autocorrect/word.py @@ -18,7 +18,7 @@ from autocorrect.word_lists import LOWERCASE, MIXED_CASE from autocorrect.word_lists import LOWERED, CASE_MAPPED -ALPHABET = 'abcdefghijklmnopqrstuvwxyz' +ALPHABET = '''abcdefghijklmnopqrstuvwxyzঅআইঈউঊঋঌএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভময়সশষহযরড়ঢ়লৎঁংঃ্৷ািীুূৃেৈোৌ''' KNOWN_WORDS = LOWERCASE | LOWERED | NLP_WORDS class Word(object): diff --git a/autocorrect/word_lists.py b/autocorrect/word_lists.py index 12054d8..936211b 100644 --- a/autocorrect/word_lists.py +++ b/autocorrect/word_lists.py @@ -13,7 +13,7 @@ https://github.com/foobarmus/autocorrect """ -from autocorrect.utils import words_from_archive +from autocorrect.utils import words_from_archive,words_from_file,words_from_archive1 # en_US_GB_CA is a superset of US, GB and CA # spellings (color, colour, etc). It contains @@ -29,10 +29,11 @@ # Colombo (mixed) LOWERCASE = words_from_archive('en_US_GB_CA_lower.txt') +#LOWERCASE = words_from_archive1('bdict4.txt') # {'we', 'flew', 'to', 'via'} - -CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt', - map_case=True) +#just add the list of words of the language which you wish to add with lowercase (if its devnagari type only this will suffice) +CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt', map_case=True) +CASE_MAPPED = {} # {abu': 'Abu', # 'dhabi': 'Dhabi', # 'colombo': 'Colombo'} diff --git a/autocorrect/word_lists_bn.py b/autocorrect/word_lists_bn.py new file mode 100644 index 0000000..15a385f --- /dev/null +++ b/autocorrect/word_lists_bn.py @@ -0,0 +1,52 @@ +# Python 3 Spelling Corrector +# +# Copyright 2014 Jonas McCallum. +# Updated for Python 3, based on Peter Norvig's +# 2007 version: http://norvig.com/spell-correct.html +# +# Open source, MIT license +# http://www.opensource.org/licenses/mit-license.php +""" +Word lists for case sensitive/insensitive lookups + +Author: Jonas McCallum +https://github.com/foobarmus/autocorrect + +""" +from autocorrect.utils import words_from_archive,words_from_file,words_from_archive1 + +# en_US_GB_CA is a superset of US, GB and CA +# spellings (color, colour, etc). It contains +# roughly half a million words. For this +# example, imagine it's just seven words... +# +# we (lower) +# flew (lower) +# to (lower) +# Abu (mixed) +# Dhabi (mixed) +# via (lower) +# Colombo (mixed) + +#LOWERCASE = words_from_archive('en_US_GB_CA_lower.txt') +LOWERCASE = words_from_archive1('bdict4.txt') +# {'we', 'flew', 'to', 'via'} +#just add the list of words of the language which you wish to add with lowercase (if its devnagari type only this will suffice) +#CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt', map_case=True) +CASE_MAPPED = {} +# {abu': 'Abu', +# 'dhabi': 'Dhabi', +# 'colombo': 'Colombo'} +# +# Note that en_US_GB_CA_mixed.txt also contains +# acronyms/mixed case variants of common words, +# so in reality, CASE_MAPPED also contains: +# +# {'to': 'TO', +# 'via': 'Via'} + +MIXED_CASE = set(CASE_MAPPED.values()) +# {'Abu', 'Dhabi', 'Colombo'} + +LOWERED = set(CASE_MAPPED.keys()) +# {'abu', 'dhabi', 'colombo'} diff --git a/autocorrect/words.bz2 b/autocorrect/words.bz2 index 888c29f..646af2e 100644 Binary files a/autocorrect/words.bz2 and b/autocorrect/words.bz2 differ diff --git a/build/lib/autocorrect/__init__.py b/build/lib/autocorrect/__init__.py new file mode 100644 index 0000000..7016b08 --- /dev/null +++ b/build/lib/autocorrect/__init__.py @@ -0,0 +1,39 @@ +# Python 3 Spelling Corrector +# +# Copyright 2014 Jonas McCallum. +# Updated for Python 3, based on Peter Norvig's +# 2007 version: http://norvig.com/spell-correct.html +# +# Open source, MIT license +# http://www.opensource.org/licenses/mit-license.php +""" +Spell function +Modified by Subrata Sarkar +https://github.com/SubrataSarkar32 +Author: Jonas McCallum +https://github.com/foobarmus/autocorrect + +""" +from autocorrect.nlp_parser import NLP_COUNTS +from autocorrect.nlp_parser_bn import NLP_COUNTS as NLP_COUNTS_BN +from autocorrect.word import Word, common, exact, known, get_case + +def spell(word,language='en'): + """The language parameter takes into account of the language. + most likely correction for everything up to a double typo""" + if(language == 'en'): + w = Word(word) + candidates = (common([word]) or exact([word]) or known([word]) or + known(w.typos()) or common(w.double_typos()) or + [word]) + correction = max(candidates, key=NLP_COUNTS.get) + return get_case(word, correction) + elif(language == 'bn'): + w = Word(word) + candidates = (common([word]) or exact([word]) or known([word]) or + known(w.typos()) or common(w.double_typos()) or + [word]) + correction = max(candidates, key=NLP_COUNTS_BN.get) + return get_case(word, correction) + else: + raise ValueError("This language is not supported") diff --git a/build/lib/autocorrect/nlp_parser.py b/build/lib/autocorrect/nlp_parser.py new file mode 100644 index 0000000..728d730 --- /dev/null +++ b/build/lib/autocorrect/nlp_parser.py @@ -0,0 +1,36 @@ +# Python 3 Spelling Corrector +# +# Copyright 2014 Jonas McCallum. +# Updated for Python 3, based on Peter Norvig's +# 2007 version: http://norvig.com/spell-correct.html +# +# Open source, MIT license +# http://www.opensource.org/licenses/mit-license.php +""" +NLP parser + +Author: Jonas McCallum +https://github.com/foobarmus/autocorrect + +""" +from autocorrect.utils import words_from_archive, zero_default_dict,words_from_file,words_from_archive1 + +def parse(lang_sample): + """tally word popularity using novel extracts, etc""" + words = words_from_archive(lang_sample, include_dups=True) + counts = zero_default_dict() + for word in words: + counts[word] += 1 + return set(words), counts + +def parse1(lang_sample): + """tally word popularity using novel extracts, etc""" + words = words_from_archive1(lang_sample, include_dups=True) + counts = zero_default_dict() + for word in words: + counts[word] += 1 + return set(words), counts + +NLP_WORDS, NLP_COUNTS = parse('big.txt') +#NLP_WORDS, NLP_COUNTS = parse1('bengnovasssh.txt') +#NLP_WORDS, NLP_COUNTS = set(list(NLP_WORDS_1) + list(NLP_WORDS_2)), NLP_COUNTS_1 diff --git a/build/lib/autocorrect/nlp_parser_bn.py b/build/lib/autocorrect/nlp_parser_bn.py new file mode 100644 index 0000000..26a42df --- /dev/null +++ b/build/lib/autocorrect/nlp_parser_bn.py @@ -0,0 +1,36 @@ +# Python 3 Spelling Corrector +# +# Copyright 2014 Jonas McCallum. +# Updated for Python 3, based on Peter Norvig's +# 2007 version: http://norvig.com/spell-correct.html +# +# Open source, MIT license +# http://www.opensource.org/licenses/mit-license.php +""" +NLP parser + +Author: Jonas McCallum +https://github.com/foobarmus/autocorrect + +""" +from autocorrect.utils import words_from_archive, zero_default_dict,words_from_file,words_from_archive1 + +def parse(lang_sample): + """tally word popularity using novel extracts, etc""" + words = words_from_archive(lang_sample, include_dups=True) + counts = zero_default_dict() + for word in words: + counts[word] += 1 + return set(words), counts + +def parse1(lang_sample): + """tally word popularity using novel extracts, etc""" + words = words_from_archive1(lang_sample, include_dups=True) + counts = zero_default_dict() + for word in words: + counts[word] += 1 + return set(words), counts + +#NLP_WORDS_1, NLP_COUNTS_1 = parse('big.txt') +NLP_WORDS, NLP_COUNTS = parse1('bengnovasssh.txt') +#NLP_WORDS, NLP_COUNTS = set(list(NLP_WORDS_1) + list(NLP_WORDS_2)), NLP_COUNTS_1 diff --git a/build/lib/autocorrect/utils.py b/build/lib/autocorrect/utils.py new file mode 100644 index 0000000..b7641af --- /dev/null +++ b/build/lib/autocorrect/utils.py @@ -0,0 +1,85 @@ +# Python 3 Spelling Corrector +# +# Copyright 2014 Jonas McCallum. +# Updated for Python 3, based on Peter Norvig's +# 2007 version: http://norvig.com/spell-correct.html +# +# Open source, MIT license +# http://www.opensource.org/licenses/mit-license.php +""" +File reader, concat function and dict wrapper + +Author: Jonas McCallum +https://github.com/foobarmus/autocorrect + +""" +import re, os, tarfile, io +from contextlib import closing +from itertools import chain + +PATH = os.path.abspath(os.path.dirname(__file__)) +BZ2 = 'words.bz2' +RE = '[A-Za-z]+' +RE1 = '[\w\s]+' + +def words_from_file(filename, include_dups=False, map_case=False): + filepath=os.path.join(PATH, filename) + with io.open(filepath,'r',encoding='utf8') as f: + text = f.read() + words = text.split() + if include_dups: + return words + elif map_case: + return words + else: + return set(words) + +def words_from_archive1(filename, include_dups=False, map_case=False): + """extract words from a text file in the archive""" + bz2 = os.path.join(PATH, BZ2) + tar_path = '{}/{}'.format('words', filename) + with closing(tarfile.open(bz2, 'r:bz2')) as t: + with closing(t.extractfile(tar_path)) as f: + words = re.findall(RE1, f.read().decode(encoding='utf-8')) + if include_dups: + return words + elif map_case: + return {w.lower():w for w in words} + else: + return set(words) + + +def words_from_archive(filename, include_dups=False, map_case=False): + """extract words from a text file in the archive""" + bz2 = os.path.join(PATH, BZ2) + tar_path = '{}/{}'.format('words', filename) + with closing(tarfile.open(bz2, 'r:bz2')) as t: + with closing(t.extractfile(tar_path)) as f: + words = re.findall(RE, f.read().decode(encoding='utf-8')) + if include_dups: + return words + elif map_case: + return {w.lower():w for w in words} + else: + return set(words) + +def concat(*args): + """reversed('th'), 'e' => 'hte'""" + try: + return ''.join(args) + except TypeError: + return ''.join(chain.from_iterable(args)) + +class Zero(dict): + """dict with a zero default""" + + def __getitem__(self, key): + return self.get(key) + + def get(self, key): + try: + return super(Zero, self).__getitem__(key) + except KeyError: + return 0 + +zero_default_dict = Zero diff --git a/build/lib/autocorrect/word.py b/build/lib/autocorrect/word.py new file mode 100644 index 0000000..6cb8207 --- /dev/null +++ b/build/lib/autocorrect/word.py @@ -0,0 +1,115 @@ +# Python 3 Spelling Corrector +# +# Copyright 2014 Jonas McCallum. +# Updated for Python 3, based on Peter Norvig's +# 2007 version: http://norvig.com/spell-correct.html +# +# Open source, MIT license +# http://www.opensource.org/licenses/mit-license.php +""" +Word based methods and functions + +Author: Jonas McCallum +https://github.com/foobarmus/autocorrect + +""" +from autocorrect.utils import concat +from autocorrect.nlp_parser import NLP_WORDS +from autocorrect.word_lists import LOWERCASE, MIXED_CASE +from autocorrect.word_lists import LOWERED, CASE_MAPPED + +ALPHABET = '''abcdefghijklmnopqrstuvwxyzঅআইঈউঊঋঌএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভময়সশষহযরড়ঢ়লৎঁংঃ্৷ািীুূৃেৈোৌ''' +KNOWN_WORDS = LOWERCASE | LOWERED | NLP_WORDS + +class Word(object): + """container for word-based methods""" + + def __init__(self, word): + """ + Generate slices to assist with typo + definitions. + + 'the' => (('', 'the'), ('t', 'he'), + ('th', 'e'), ('the', '')) + + """ + word_ = word.lower() + slice_range = range(len(word_) + 1) + self.slices = tuple((word_[:i], word_[i:]) + for i in slice_range) + self.word = word + + def _deletes(self): + """th""" + return {concat(a, b[1:]) + for a, b in self.slices[:-1]} + + def _transposes(self): + """teh""" + return {concat(a, reversed(b[:2]), b[2:]) + for a, b in self.slices[:-2]} + + def _replaces(self): + """tge""" + return {concat(a, c, b[1:]) + for a, b in self.slices[:-1] + for c in ALPHABET} + + def _inserts(self): + """thwe""" + return {concat(a, c, b) + for a, b in self.slices + for c in ALPHABET} + + def typos(self): + """letter combinations one typo away from word""" + return (self._deletes() | self._transposes() | + self._replaces() | self._inserts()) + + def double_typos(self): + """letter combinations two typos away from word""" + return {e2 for e1 in self.typos() + for e2 in Word(e1).typos()} + + +def common(words): + """{'the', 'teh'} => {'the'}""" + return set(words) & NLP_WORDS + +def exact(words): + """{'Snog', 'snog', 'Snoddy'} => {'Snoddy'}""" + return set(words) & MIXED_CASE + +def known(words): + """{'Gazpacho', 'gazzpacho'} => {'gazpacho'}""" + return {w.lower() for w in words} & KNOWN_WORDS + +def known_as_lower(words): + """{'Natasha', 'Bob'} => {'bob'}""" + return {w.lower() for w in words} & LOWERCASE + +def get_case(word, correction): + """ + Best guess of intended case. + + manchester => manchester + chilton => Chilton + AAvTech => AAvTech + THe => The + imho => IMHO + + """ + if word.istitle(): + return correction.title() + if word.isupper(): + return correction.upper() + if correction == word and not word.islower(): + return word + if len(word) > 2 and word[:2].isupper(): + return correction.title() + if not known_as_lower([correction]): #expensive + try: + return CASE_MAPPED[correction] + except KeyError: + pass + return correction diff --git a/build/lib/autocorrect/word_lists.py b/build/lib/autocorrect/word_lists.py new file mode 100644 index 0000000..936211b --- /dev/null +++ b/build/lib/autocorrect/word_lists.py @@ -0,0 +1,52 @@ +# Python 3 Spelling Corrector +# +# Copyright 2014 Jonas McCallum. +# Updated for Python 3, based on Peter Norvig's +# 2007 version: http://norvig.com/spell-correct.html +# +# Open source, MIT license +# http://www.opensource.org/licenses/mit-license.php +""" +Word lists for case sensitive/insensitive lookups + +Author: Jonas McCallum +https://github.com/foobarmus/autocorrect + +""" +from autocorrect.utils import words_from_archive,words_from_file,words_from_archive1 + +# en_US_GB_CA is a superset of US, GB and CA +# spellings (color, colour, etc). It contains +# roughly half a million words. For this +# example, imagine it's just seven words... +# +# we (lower) +# flew (lower) +# to (lower) +# Abu (mixed) +# Dhabi (mixed) +# via (lower) +# Colombo (mixed) + +LOWERCASE = words_from_archive('en_US_GB_CA_lower.txt') +#LOWERCASE = words_from_archive1('bdict4.txt') +# {'we', 'flew', 'to', 'via'} +#just add the list of words of the language which you wish to add with lowercase (if its devnagari type only this will suffice) +CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt', map_case=True) +CASE_MAPPED = {} +# {abu': 'Abu', +# 'dhabi': 'Dhabi', +# 'colombo': 'Colombo'} +# +# Note that en_US_GB_CA_mixed.txt also contains +# acronyms/mixed case variants of common words, +# so in reality, CASE_MAPPED also contains: +# +# {'to': 'TO', +# 'via': 'Via'} + +MIXED_CASE = set(CASE_MAPPED.values()) +# {'Abu', 'Dhabi', 'Colombo'} + +LOWERED = set(CASE_MAPPED.keys()) +# {'abu', 'dhabi', 'colombo'} diff --git a/build/lib/autocorrect/word_lists_bn.py b/build/lib/autocorrect/word_lists_bn.py new file mode 100644 index 0000000..15a385f --- /dev/null +++ b/build/lib/autocorrect/word_lists_bn.py @@ -0,0 +1,52 @@ +# Python 3 Spelling Corrector +# +# Copyright 2014 Jonas McCallum. +# Updated for Python 3, based on Peter Norvig's +# 2007 version: http://norvig.com/spell-correct.html +# +# Open source, MIT license +# http://www.opensource.org/licenses/mit-license.php +""" +Word lists for case sensitive/insensitive lookups + +Author: Jonas McCallum +https://github.com/foobarmus/autocorrect + +""" +from autocorrect.utils import words_from_archive,words_from_file,words_from_archive1 + +# en_US_GB_CA is a superset of US, GB and CA +# spellings (color, colour, etc). It contains +# roughly half a million words. For this +# example, imagine it's just seven words... +# +# we (lower) +# flew (lower) +# to (lower) +# Abu (mixed) +# Dhabi (mixed) +# via (lower) +# Colombo (mixed) + +#LOWERCASE = words_from_archive('en_US_GB_CA_lower.txt') +LOWERCASE = words_from_archive1('bdict4.txt') +# {'we', 'flew', 'to', 'via'} +#just add the list of words of the language which you wish to add with lowercase (if its devnagari type only this will suffice) +#CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt', map_case=True) +CASE_MAPPED = {} +# {abu': 'Abu', +# 'dhabi': 'Dhabi', +# 'colombo': 'Colombo'} +# +# Note that en_US_GB_CA_mixed.txt also contains +# acronyms/mixed case variants of common words, +# so in reality, CASE_MAPPED also contains: +# +# {'to': 'TO', +# 'via': 'Via'} + +MIXED_CASE = set(CASE_MAPPED.values()) +# {'Abu', 'Dhabi', 'Colombo'} + +LOWERED = set(CASE_MAPPED.keys()) +# {'abu', 'dhabi', 'colombo'} diff --git a/build/lib/autocorrect/words.bz2 b/build/lib/autocorrect/words.bz2 new file mode 100644 index 0000000..646af2e Binary files /dev/null and b/build/lib/autocorrect/words.bz2 differ diff --git a/dist/autocorrect-0.3.0-py3-none-any.whl b/dist/autocorrect-0.3.0-py3-none-any.whl new file mode 100644 index 0000000..66fe442 Binary files /dev/null and b/dist/autocorrect-0.3.0-py3-none-any.whl differ