-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathToDocx.py
More file actions
66 lines (58 loc) · 2.36 KB
/
ToDocx.py
File metadata and controls
66 lines (58 loc) · 2.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from docx import Document
from docx.enum.section import WD_SECTION
# #*#import easyocr
# #*#import torch
import pytesseract
import TextStyle
from docx.shared import Pt
from stqdm import stqdm
import os
import warnings
warnings.filterwarnings('ignore')
import locale
locale.getpreferredencoding()
# Point pytesseract at the system-wide Tesseract binary.
# NOTE: an earlier revision walked the whole filesystem from "/" looking for
# files containing "tesseract" in their name, but it discarded every match and
# the path below was always used regardless. That walk only slowed down the
# import, so it has been removed.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
# tessdata_dir_config = "--tessdata-dir 'C:\\Program Files \\Tesseract-OCR\\tessdata\\"
def im2dox(file, language, confidence):
    """Run Tesseract OCR over page segments and collect the text in a Word document.

    Each segment becomes its own paragraph. The recognised text is written
    character by character so that a single unusable character can be dropped
    without losing the rest of the segment. After every page (including the
    last one) a new-page section is appended.

    Args:
        file: Iterable of pages; each page is an iterable of image segments
            (presumably whatever ``pytesseract.image_to_string`` and
            ``TextStyle.text_style`` accept -- verify against the caller).
        language: Tesseract language code, passed through as ``lang``.
        confidence: Value forwarded unchanged to ``TextStyle.text_style``.

    Returns:
        The populated ``Document`` instance.
    """
    document = Document()  # creating Word document instance
    # #*# torch.cuda.empty_cache()
    # #*# reader = easyocr.Reader([language], gpu=False)  # calling ocr reader
    print("Extracting text to docx file")
    for ref, page in enumerate(file):
        print("Reading page {}".format(ref + 1))
        for segment in stqdm(page):
            # for each text layout segment, a separate paragraph is created
            p = document.add_paragraph()
            # #*#result = reader.readtext(segment)  # and filled with predicted text
            result = pytesseract.image_to_string(segment, lang=language, config='--psm 6')
            font_style = TextStyle.text_style(segment, confidence)
            cursive = font_style == 'Cursive'
            # #*#for (bbox, text, prob) in result:
            for t in result:
                # The cp850 round-trip acts as a filter: characters outside
                # that code page are reported and skipped. (Previously the
                # loop fell through here and re-used a stale or undefined
                # ``text``, duplicating the previous character or raising
                # NameError.)
                try:
                    text = t.encode("cp850").decode("cp850")
                except UnicodeEncodeError:
                    print(t)
                    continue
                # add_run rejects characters that are not XML-compatible with
                # ValueError; such characters are dropped.
                try:
                    run = p.add_run(text)
                    if cursive:
                        run.font.name = 'Brush Script MT'  # writing cursive font text
                    run.font.size = Pt(14)
                except ValueError:
                    continue
        document.add_section(start_type=WD_SECTION.NEW_PAGE)
    return document
def txt2dox(text):
    """Wrap *text* in a new Word document.

    A fresh ``Document`` is created with one paragraph holding a single run
    that contains ``text`` in the default font.

    Args:
        text: The string to place in the document.

    Returns:
        The newly created ``Document``.
    """
    doc = Document()
    # One paragraph, one run: the text is written with the standard font.
    doc.add_paragraph().add_run(text)
    return doc
# Legend: lines marked with "#*#" are disabled code for the alternative easyocr backend.