-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathocr.py
More file actions
93 lines (71 loc) · 3.1 KB
/
ocr.py
File metadata and controls
93 lines (71 loc) · 3.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import sys
import os
import subprocess
from os import listdir
from os.path import isfile, join
#getting number of pages in a pdf
from subprocess import check_output
#to calculate time
from timeit import default_timer as timer
################ ARGUMENTS ########################
#print("Arguements List :" ,str(sys.argv))
# getting file name and path of the file
#getopt.getopt(args, options, [long_options])
#print(__file__)
#extracting file-name from path
# head, tail = os.path.split(input_file_path)
# print("File name : ",tail)
############### GLOBAL VARIABLES HERE #############
# Directory that is scanned for the files to process.
all_input_file_path = "./input"
# Names of the regular files directly inside the input directory;
# sub-directories are skipped by the isfile() check.
only_files = [
    entry
    for entry in listdir(all_input_file_path)
    if isfile(join(all_input_file_path, entry))
]
########################################################################
#################### COUNTING PAGES IN A PDF ###########################
########################################################################
# def get_num_pages(pdf_path):
# # returns the number of pages from a given file
# output = check_output(["pdfinfo", pdf_path]).decode()
# pages_line = [line for line in output.splitlines() if "Pages:" in line][0]
# num_pages = int(pages_line.split(":")[1])
# return num_pages
########################################################################
# Page total, initialised to zero (not updated in the code visible here).
total_pages = 0
# Number of input files found when the script started.
total_files = len(only_files)
def to_text(
    path_to_jpg_files
):
    """Run tesseract on every image in ``./temp`` and delete each image afterwards.

    Every regular file found directly inside ``./temp`` is handed to the
    ``tesseract`` command-line tool with ``-l eng --oem 1 --psm 3``. The
    output base passed to tesseract is ``./output/<image name without
    extension>``. The image is removed as soon as the command returns,
    whatever its exit status.

    Args:
        path_to_jpg_files: Path of the image produced by the caller.
            Currently unused: the whole ``./temp`` directory is scanned
            instead. The parameter is kept so existing callers keep working.

    Raises:
        FileNotFoundError: If ``./temp`` does not exist or the ``tesseract``
            executable cannot be found.
    """
    temp_dir = "./temp"
    jpg_files = [f for f in listdir(temp_dir) if isfile(join(temp_dir, f))]
    for i_file in jpg_files:
        input_file = temp_dir + "/" + i_file
        output_file_path = "./output/" + os.path.splitext(i_file)[0]
        # Pass the arguments as a list without a shell, so file names that
        # contain spaces or shell metacharacters reach tesseract intact and
        # cannot be interpreted as extra shell commands.
        subprocess.run(
            [
                "tesseract",
                input_file,
                output_file_path,
                "-l", "eng",
                "--oem", "1",
                "--psm", "3",
            ]
        )
        os.remove(input_file)
def to_jpg(
    input_file
):
    """Convert one file from the input directory to JPG, then OCR the result.

    Runs ``convert -density 150 <input> ./temp/<stem>.jpg`` (presumably
    ImageMagick's ``convert`` -- confirm against the deployment environment)
    and then calls ``to_text`` to process the contents of ``./temp``.

    Args:
        input_file: Name of a file inside ``all_input_file_path`` (a bare
            file name, not a full path).

    Raises:
        FileNotFoundError: If the ``convert`` executable cannot be found.
    """
    input_file_path = all_input_file_path + "/" + input_file
    # splitext returns (stem, extension); only the stem is needed here.
    stem = os.path.splitext(input_file)[0]
    output_file_name = "./temp/" + stem + ".jpg"
    # Argument list without a shell: file names containing spaces or shell
    # metacharacters are passed through unchanged instead of being split or
    # executed by the shell.
    subprocess.run(
        ["convert", "-density", "150", input_file_path, output_file_name]
    )
    to_text(output_file_name)
# Convert and OCR every file found in the input directory, one at a time.
# The position of the file in the list is not needed, so iterate directly.
for input_file in only_files:
    to_jpg(input_file)
# output_file_path = "./output/output"
# how to run a command line statement inside python file
# command = "tesseract "+ input_file_path +" "+ output_file_path +" -l eng --oem 1 --psm 3"
# subprocess.run([command], shell= True)