forked from andreamust/OCR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocr_pdf.py
123 lines (100 loc) · 4.49 KB
/
ocr_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from pdf2image import convert_from_path
import os
import ntpath
import cv2
import pytesseract
import numpy as np
import re
# Input document handed to file_info() and pdf_to_img() by the __main__ section.
# NOTE(review): this value has no ".pdf" suffix, so the `extension == ".pdf"` test in the
# __main__ section is False for it and the PDF-to-image conversion step is skipped.
PDF_PATH = "/Users/andreapoltronieri/Documents/Assegno/Polifonia/WP4/Books/Dictionnaire_de_musique_Rousseau"
# Base output location; file_info() combines it with the input file's base name to build the results folder.
OUTPUT_PATH = "/Users/andreapoltronieri/Documents/Assegno/Polifonia/WP4/Books/"
# Image format used for the converted pages; the __main__ loop only OCRs files with this extension.
OUTPUT_FORMAT = "png"
# NOTE(review): not referenced anywhere in the visible code; the __main__ section writes to
# "<input file name>.txt" instead — confirm whether this constant is still needed.
OUTPUT_NAME = "scanned.txt"
# Language code passed to ocr() by the __main__ section.
LANGUAGE = 'fra'
def file_info(file_path, out_path):
    """Split the input path into name and extension and derive the results folder.

    Args:
        file_path: Path of the input file. ``ntpath.basename`` is used so that both
            ``/`` and ``\\`` are accepted as directory separators.
        out_path: Base output directory, with or without a trailing separator.

    Returns:
        A tuple ``(file_name_no_ext, file_ext, results_path)`` where ``file_ext``
        includes the leading dot (or is empty) and ``results_path`` is ``out_path``
        joined with the extension-less file name.
    """
    file_name = ntpath.basename(file_path)
    file_name_no_ext, file_ext = os.path.splitext(file_name)
    # os.path.join inserts the separator only when out_path does not already end with
    # one, so "/out" and "/out/" both yield "/out/<name>".
    results_path = os.path.join(out_path, file_name_no_ext)
    return file_name_no_ext, file_ext, results_path
def pdf_to_img(file_path, out_path, out_format="png", dpi=500):
    """Convert every page of a PDF into an image file.

    The images are written to the results folder computed by ``file_info`` and are
    named after their 1-based page number (``1.<out_format>``, ``2.<out_format>``, ...).

    Args:
        file_path: Path of the PDF to convert.
        out_path: Base output path handed to ``file_info``.
        out_format: Used both as the file extension and as the format passed to
            each page's ``save`` method.
        dpi: Resolution handed to ``convert_from_path``. Defaults to 500.
    """
    _, _, results_path = file_info(file_path, out_path)
    # exist_ok makes the call a no-op when the folder is already there, replacing the
    # separate isdir() check.
    os.makedirs(results_path, exist_ok=True)

    pages = convert_from_path(file_path, dpi=dpi)
    for page_num, page in enumerate(pages, start=1):
        image_path = "{}/{}.{}".format(results_path, page_num, out_format)
        page.save(image_path, out_format)
        # Report the file that was written rather than the repr of the page object.
        print("SAVING IMAGE: {}".format(image_path))
def image_processing(input_path, gray_scale=True, remove_noise=True, tresholding=True, dilate=True, erosion=True,
                     edge_detection=True, skew_correction=True):
    """Load an image and apply the enabled preprocessing steps in a fixed order.

    The steps run in this order, each one operating on the output of the previous
    enabled step: grayscale conversion, median blur, Otsu binarization, dilation,
    erosion, Canny edge detection, skew correction.

    Args:
        input_path: Path of the image file to load with ``cv2.imread``.
        gray_scale: Convert the image from BGR to grayscale.
        remove_noise: Apply a median blur with aperture size 5.
        tresholding: Binarize with ``THRESH_BINARY + THRESH_OTSU``.
        dilate: Dilate once with a 5x5 kernel of ones.
        erosion: Erode once with a 5x5 kernel of ones.
        edge_detection: Run Canny with thresholds 100 and 200.
        skew_correction: Rotate the image around its center by the angle of the
            minimum-area rectangle enclosing all non-zero pixels.

    Returns:
        The processed image array.

    Raises:
        OSError: If ``cv2.imread`` could not load ``input_path``.
    """
    image = cv2.imread(input_path)
    if image is None:
        # cv2.imread signals failure by returning None; fail here with the offending
        # path instead of letting a later cv2 call fail on a None input.
        raise OSError("Unable to read image: {}".format(input_path))

    kernel = np.ones((5, 5), np.uint8)

    if gray_scale:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    if remove_noise:
        image = cv2.medianBlur(image, 5)
    if tresholding:
        image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    if dilate:
        image = cv2.dilate(image, kernel, iterations=1)
    if erosion:
        image = cv2.erode(image, kernel, iterations=1)
    if edge_detection:
        image = cv2.Canny(image, 100, 200)
    if skew_correction:
        coords = np.column_stack(np.where(image > 0))
        # Without any non-zero pixel (e.g. a blank page) there is nothing to estimate
        # an angle from, so the image is returned unrotated.
        if coords.size:
            angle = cv2.minAreaRect(coords)[-1]
            # NOTE(review): this normalization presumably targets an angle range of
            # [-90, 0); confirm the range returned by the installed OpenCV version.
            if angle < -45:
                angle = -(90 + angle)
            else:
                angle = -angle
            (h, w) = image.shape[:2]
            center = (w // 2, h // 2)
            M = cv2.getRotationMatrix2D(center, angle, 1.0)
            image = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return image
# open image without columns
#text_result = pytesseract.image_to_string(Image.open(r'C:\Users\Utente\Documents\Polifonia\OCR\materiale da scannerizzare\ArpaOriginale\Arpa-1853-025-1.jpg'), lang='ita')
def ocr(processed_image, language="fra+eng+ita+spa"):
    """Run Tesseract on a preprocessed image and return the recognized text.

    Args:
        processed_image: Image handed to ``pytesseract.image_to_string``.
        language: Tesseract language code(s). Several languages can be combined
            with ``+``; the default covers French, English, Italian and Spanish.
            ``pytesseract.get_languages(config='')`` lists the available languages.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    # --oem 3: default OCR engine mode; --psm 6: treat the page as a single uniform
    # block of text.
    custom_config = r'--oem 3 --psm 6'
    return pytesseract.image_to_string(processed_image, config=custom_config, lang=language)
def save_to_txt(out_name, ocr_res):
    """Write the OCR result to a text file, replacing any existing content.

    Args:
        out_name: Path of the text file to create or overwrite.
        ocr_res: Text to write.
    """
    # The context manager closes the file even if the write fails. UTF-8 is set
    # explicitly so accented characters do not depend on the platform's default encoding.
    with open(out_name, 'w', encoding='utf-8') as f:
        f.write(ocr_res)
def _page_sort_key(file_name):
    """Return a sort key that orders image files by the number in their name."""
    digits = re.sub(r'\D', '', os.path.splitext(file_name)[0])
    # Files without any digit in their stem sort first instead of making int() fail;
    # the name itself breaks ties so the order is deterministic.
    return (int(digits) if digits else -1, file_name)


if __name__ == "__main__":
    file_name, extension, final_path = file_info(PDF_PATH, OUTPUT_PATH)
    if extension == ".pdf" and final_path:
        print("The input file is a .pdf file. Converting to image in {} format.".format(OUTPUT_FORMAT))
        pdf_to_img(PDF_PATH, OUTPUT_PATH, OUTPUT_FORMAT)
    else:
        # Reached whenever the input does not have a ".pdf" extension: no conversion is
        # done and whatever is already in the results folder gets processed.
        print("THE DESTINATION FOLDER IS NOT EMPTY. PROCESSING THE FILES CONTAINED IN IT.")

    print("PROCESSING FOLDER: {}".format(final_path))
    wanted_extension = ".{}".format(OUTPUT_FORMAT)
    page_texts = []
    for path, _dirs, images in os.walk(final_path):
        for image_name in sorted(images, key=_page_sort_key):
            if os.path.splitext(image_name)[1] != wanted_extension:
                print("UNABLE TO PROCESS FILE: {}".format(image_name))
                continue
            print("PROCESSING IMAGE: {}/{}".format(path, image_name))
            processed = image_processing("{}/{}".format(path, image_name))
            page_texts.append(ocr(processed, LANGUAGE))

    # Each page's text is preceded by a newline; joining once avoids rebuilding the
    # whole string on every page.
    save_to_txt(file_name + ".txt", "".join("\n" + text for text in page_texts))