forked from grobidOrg/grobid-client-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathxmltodict.py
More file actions
96 lines (86 loc) · 3.6 KB
/
xmltodict.py
File metadata and controls
96 lines (86 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import datetime
import random
import string
import hashlib
import argparse
import sys
import tkinter as tk
from tkinter import filedialog
import shutil
from tkinter.constants import FALSE
import xmltodict
import pprint
from pathlib import Path
from grobid_client.grobid_client import GrobidClient
# Command-line options for the hash generator:
#   -s / --slug        switch; passed to random_md5() as ``slug`` (6-char hashes)
#   -c / --no-newline  switch; a single hash is written without a trailing newline
#   -n / --number      integer count of hashes to generate (defaults to 1)
parser = argparse.ArgumentParser()
parser.add_argument(
    '-s', '--slug',
    default=False,
    action='store_true',
)
parser.add_argument(
    '-c', '--no-newline',
    default=False,
    action='store_true',
)
parser.add_argument(
    '-n', '--number',
    default=1,
    type=int,
    action='store',
)
def random_md5(string_length=25, slug=False, number=1):
    """Return a list of MD5 hex digests of freshly generated random strings.

    Each digest is computed over a random string of ASCII letters and digits,
    so the results are throwaway identifiers rather than hashes of any real
    content. The ``random`` module is used, so the values are not suitable
    for anything security-sensitive.

    Args:
        string_length: Number of random characters hashed for each digest.
        slug: When truthy, keep only the first 6 hex characters of each digest.
        number: How many digests to produce.

    Returns:
        A list with ``number`` hex strings (32 characters each, or 6 when
        ``slug`` is set). The list is empty when ``number`` is 0 or negative.
    """
    # Hoisted: the alphabet is the same for every character drawn.
    alphabet = string.ascii_letters + string.digits
    hashes = []
    for _ in range(number):
        seed_text = ''.join(random.choice(alphabet) for _ in range(string_length))
        digest = hashlib.md5(seed_text.encode('utf-8')).hexdigest()
        hashes.append(digest[:6] if slug else digest)
    return hashes
# Command-line entry point: generate the requested hashes and echo them.
if __name__ == '__main__':
    arguments = parser.parse_args()
    hashes = random_md5(slug=arguments.slug, number=arguments.number)
    if arguments.number == 1 and arguments.no_newline:
        # Single hash requested without a trailing newline.
        sys.stdout.write(hashes[0])
    else:
        for digest in hashes:
            print(digest)
else:
    hashes = []

# Pick the hash that names this run's working directories. The last generated
# hash is used; when none was generated (e.g. ``--number 0`` or the module was
# imported) a fresh one is created, so the directory name never falls back to
# the repr of the builtin ``hash`` function.
paper_hash = hashes[-1] if hashes else random_md5()[0]

# Take one timestamp so the date and time parts cannot straddle midnight and
# the input and output directories share the same suffix.
run_stamp = datetime.datetime.now()
# ':' is replaced in the time part -- presumably to keep the directory name
# valid on filesystems that reject colons; confirm if this matters.
run_suffix = str(paper_hash) + '_' + str(run_stamp.date()) + '_' + str(run_stamp.time()).replace(':', '.')
id_paper_input = "./resources/input_pdf/" + run_suffix
id_paper_output = "./resources/output_pdf/" + run_suffix
def main():
    """Ask the user for a file and stage it in this run's input directory.

    If ``id_paper_input`` already exists, only a message is printed. Otherwise
    a file-open dialog is shown, the input and output directories are created,
    and the chosen file is copied into the input directory.

    Raises:
        FileNotFoundError: If the dialog is dismissed without choosing a file.
            This is raised before any directory is created, so a cancelled
            run leaves nothing behind.
    """
    if os.path.exists(id_paper_input):
        print("Directory " , id_paper_input , " already exists")
        return

    # A hidden root window is needed to host the dialog; destroy it once the
    # dialog returns so no Tk resources linger for the rest of the run.
    root = tk.Tk()
    root.withdraw()
    try:
        file_path = filedialog.askopenfilename()
    finally:
        root.destroy()
    print(file_path)

    # The dialog returns an empty value when cancelled.
    if not file_path:
        raise FileNotFoundError("No file was selected in the dialog")

    os.makedirs(id_paper_input)
    os.makedirs(id_paper_output, exist_ok=True)
    print("Directory " , id_paper_input, id_paper_output, " Created ")
    shutil.copy(file_path, id_paper_input)
# Set to False to silence the "Unable to find ..." diagnostics below.
DEBUG = True


def extract_title(paper_dict):
    """Return the title text from a parsed TEI header, or "" if absent.

    Args:
        paper_dict: Nested mapping produced by ``xmltodict.parse`` on a TEI
            document.

    Returns:
        The ``#text`` value of TEI/teiHeader/fileDesc/titleStmt/title, or an
        empty string when any step of that path is missing.
    """
    try:
        return paper_dict['TEI']['teiHeader']['fileDesc']['titleStmt']['title']['#text']
    except (KeyError, TypeError):
        # KeyError: a key is missing; TypeError: an intermediate node is
        # None, a string or a list rather than a mapping.
        if DEBUG:
            print("Unable to find title")
        return ""


def extract_abstract(paper_dict):
    """Return the abstract paragraph(s) from a parsed TEI header, or "".

    Args:
        paper_dict: Nested mapping produced by ``xmltodict.parse`` on a TEI
            document.

    Returns:
        Whatever is stored at TEI/teiHeader/profileDesc/abstract/p, returned
        unchanged, or an empty string when the path is missing.
    """
    try:
        return paper_dict['TEI']['teiHeader']['profileDesc']['abstract']['p']
    except (KeyError, TypeError):
        if DEBUG:
            print("Unable to find abstract")
        return ""


def _term_text(term):
    """Return the text of one keyword term (plain string or attribute dict)."""
    if isinstance(term, dict):
        term = term.get('#text')
    return term or ""


def extract_keywords(paper_dict):
    """Return the keywords from a parsed TEI header, or "" if absent.

    Args:
        paper_dict: Nested mapping produced by ``xmltodict.parse`` on a TEI
            document.

    Returns:
        When the keywords node has ``term`` children, their texts joined with
        ", ". A single term is returned as is rather than being split into
        characters. When there is no ``term`` child the keywords node itself
        is returned unchanged. An empty string is returned when the node is
        missing or empty.
    """
    try:
        keywords = paper_dict['TEI']['teiHeader']['profileDesc']['textClass']['keywords']
        if keywords is None:
            raise KeyError('keywords')
        if isinstance(keywords, dict) and "term" in keywords:
            terms = keywords['term']
            # A lone <term> is not wrapped in a list; normalise before joining.
            if not isinstance(terms, list):
                terms = [terms]
            texts = [_term_text(term) for term in terms]
            return ", ".join(text for text in texts if text)
        return keywords
    except (KeyError, TypeError):
        if DEBUG:
            print("Unable to find keywords")
        return ""


if __name__ == "__main__":
    main()
    client = GrobidClient(config_path="./config.json")
    client.process("processHeaderDocument", id_paper_input, id_paper_output, consolidate_citations=True, force=True)

    tei_paths = [path for path in Path(id_paper_output).iterdir() if path.suffix == '.xml']
    if not tei_paths:
        # Nothing to parse; report it instead of failing on an unbound name.
        print("No XML output found in", id_paper_output)
    for tei_path in tei_paths:
        paper_dict = xmltodict.parse(tei_path.read_text(encoding='utf-8'))
        title = extract_title(paper_dict)
        abstract = extract_abstract(paper_dict)
        keywords = extract_keywords(paper_dict)
        print("Title: {}".format(title))
        print("Abstract: {}".format(abstract))
        print("Keywords: {}".format(keywords))