-
Notifications
You must be signed in to change notification settings - Fork 146
Expand file tree
/
Copy pathpreprocess.py
More file actions
executable file
·59 lines (50 loc) · 1.69 KB
/
preprocess.py
File metadata and controls
executable file
·59 lines (50 loc) · 1.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
"""
Example: python preprocess.py data/vocab.txt data/train.txt
The output path is the text path with ".pth" appended (data/train.txt.pth).
vocab.txt: 1stline=word, 2ndline=count
"""
import os
import sys
from codegen_sources.model.src.data.dictionary import Dictionary
from codegen_sources.model.src.logger import create_logger
# Module-level logger used by XLM_preprocess for all of its reporting.
# NOTE(review): the (None, 0) arguments presumably mean "no log file" and
# "rank 0" -- confirm against create_logger.
logger = create_logger(None, 0)
def XLM_preprocess(voc_path, txt_path, bin_path):
    """Index a text file against a vocabulary and log summary statistics.

    The vocabulary is loaded from ``voc_path`` with ``Dictionary.read_vocab``
    and then passed, together with ``txt_path`` and ``bin_path``, to
    ``Dictionary.index_data``. The returned mapping is used to log:

    * the word, unique-word and sentence counts;
    * when unknown words exist, their total count, their unique count and
      the percentage of the data they cover;
    * when there are fewer than 30000 unique unknown words, the 30 most
      frequent ones with their counts.

    Both ``voc_path`` and ``txt_path`` must be existing files; this is checked
    with ``assert``, so an ``AssertionError`` is raised otherwise. Nothing is
    returned.
    """
    for required_path in (voc_path, txt_path):
        assert os.path.isfile(required_path)

    vocab = Dictionary.read_vocab(voc_path)
    logger.info("")

    indexed = Dictionary.index_data(txt_path, bin_path, vocab)

    # The log message reports len(sentences) - len(positions) as the word
    # count and len(positions) as the sentence count.
    n_words = len(indexed["sentences"]) - len(indexed["positions"])
    summary = "%i words (%i unique) in %i sentences." % (
        n_words,
        len(indexed["dico"]),
        len(indexed["positions"]),
    )
    logger.info(summary)

    unknown = indexed["unk_words"]
    n_unknown_unique = len(unknown)
    if n_unknown_unique == 0:
        return

    n_unknown_total = sum(unknown.values())
    coverage = n_unknown_total * 100.0 / n_words
    logger.info(
        "%i unknown words (%i unique), covering %.2f%% of the data."
        % (n_unknown_total, n_unknown_unique, coverage)
    )

    if n_unknown_unique >= 30000:
        return

    # Sort ascending and then reverse (rather than sorting with reverse=True)
    # so that words with equal counts keep the same relative order as before.
    ranked = sorted(unknown.items(), key=lambda entry: entry[1])
    ranked.reverse()
    for word, count in ranked[:30]:
        logger.info("%s: %i" % (word, count))
if __name__ == "__main__":
    # Command-line entry point: expects the vocabulary path and the text path
    # as the first two positional arguments. Exit with a usage message instead
    # of a bare IndexError when they are missing; extra arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit("Usage: python %s <vocab_path> <text_path>" % sys.argv[0])
    voc_path_arg = sys.argv[1]
    txt_path_arg = sys.argv[2]
    # The output path is derived from the text path by appending ".pth".
    bin_path_arg = txt_path_arg + ".pth"
    XLM_preprocess(voc_path_arg, txt_path_arg, bin_path_arg)