-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathdrawingNum.py
107 lines (87 loc) · 4.66 KB
/
drawingNum.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import cv2
import pytesseract
from matplotlib import pyplot as pt
import numpy as np
# Hard-coded Windows path to the Tesseract executable that pytesseract runs for OCR.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# Tesseract's page segmentation mode (PSM) is set to 6 because each page is treated as a single uniform block of text.
def _find_contours(binary, mode):
    """Return only the contour list produced by ``cv2.findContours``.

    OpenCV 3.x returns ``(image, contours, hierarchy)`` whereas 2.x and 4.x
    return ``(contours, hierarchy)``; selecting by tuple length supports both.
    """
    found = cv2.findContours(binary, mode, cv2.CHAIN_APPROX_SIMPLE)
    return found[0] if len(found) == 2 else found[1]


def _erase_ruling_lines(roi):
    """Return a copy of *roi* with long horizontal and vertical lines painted white.

    Box borders surrounding the text can impair OCR, so they are detected with
    morphological opening and over-drawn in white on the copy.
    """
    cleaned = roi.copy()
    # With threshold 0 and BINARY_INV only pixels equal to 0 become foreground.
    _, ink = cv2.threshold(roi, 0, 255, cv2.THRESH_BINARY_INV)
    kernels = (
        cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)),  # horizontal lines
        cv2.getStructuringElement(cv2.MORPH_RECT, (1, 30)),  # vertical lines
    )
    for kernel in kernels:
        # Both passes read the same untouched mask; only `cleaned` is drawn on.
        line_mask = cv2.morphologyEx(ink, cv2.MORPH_OPEN, kernel, iterations=2)
        for line in _find_contours(line_mask, cv2.RETR_EXTERNAL):
            cv2.drawContours(cleaned, [line], -1, (255, 255, 255), 5)
    return cleaned


def _value_after_keyword(lines, keyword1, keyword2):
    """Return the first non-empty line after the first keyword line, else ``""``."""
    for index, line in enumerate(lines):
        if keyword1 in line or keyword2 in line:
            for candidate in lines[index + 1:]:
                if candidate != "":
                    return candidate
            return ""
    return ""


def GetString(img, keyword1, keyword2, *, min_area=40000, max_area=5000000):
    """Find the box labelled with a keyword and return the text line below the label.

    The image is blurred and inverse-thresholded, and every contour whose area
    lies strictly between ``min_area`` and ``max_area`` is treated as a
    candidate box. Each candidate is OCR'd; the first one whose text contains
    either keyword is re-read from a slightly larger crop after its ruling
    lines have been erased.

    Args:
        img: Single-channel (2-D) image array. It is never modified, so the
            same image can be passed for several keyword lookups.
        keyword1: First label text to look for (substring match).
        keyword2: Alternative label text to look for (substring match).
        min_area: Exclusive lower bound on a candidate contour's area.
        max_area: Exclusive upper bound on a candidate contour's area.

    Returns:
        The first non-empty OCR line following the keyword line; ``""`` if the
        matching box yields no such line; ``None`` if no candidate box
        contains either keyword.

    Raises:
        ValueError: If ``img`` is not 2-D.
    """
    if img.ndim != 2:
        raise ValueError("GetString expects a single-channel (2-D) image")

    ocr_config = '--psm 6'
    blurred = cv2.GaussianBlur(img, (3, 3), 0)
    _, binary = cv2.threshold(blurred, 127, 1, cv2.THRESH_BINARY_INV)

    for contour in _find_contours(binary, cv2.RETR_TREE):
        if not min_area < cv2.contourArea(contour) < max_area:
            continue
        x, y, w, h = cv2.boundingRect(contour)
        text = pytesseract.image_to_string(img[y:y + h, x:x + w], config=ocr_config).strip()
        if text == "":
            # A blank box says nothing about the remaining boxes; keep searching.
            continue
        if keyword1 not in text and keyword2 not in text:
            continue

        # Re-read a larger crop: 100 px further down and 10 px in from the left
        # edge (presumably to include the value under the label and to drop the
        # left border - TODO confirm). NumPy clamps slices at the image edge.
        region = img[y:y + h + 100, x + 10:x + w]
        cleaned = _erase_ruling_lines(region)
        lines = pytesseract.image_to_string(cleaned, config=ocr_config).strip().splitlines()
        return _value_after_keyword(lines, keyword1, keyword2)

    return None
# img = cv2.imread("08.png", 0)
# data_extract = {}
# drawingNum = GetString(img, "DRAWING NUMBER", "DRAWING NO")
# drawnBy = GetString(img, "DRAWN BY", "DRAWN")
# checkedBy = GetString(img, "CHECKED BY", "CHECKED")
# title = GetString(img, "TITLE", "DRAWING TITLE")
# approvedBy = GetString(img, "APPROVED BY", "APPROVED")
# contractor = GetString(img, "CONTRACTOR", "COMPANY")
# unit = GetString(img, "UNIT", "UNIT")
# status = GetString(img, "STATUS", "STATUS")
# page = GetString(img, "PAGE", "PAGE")
# projectNum = GetString(img, "PROJECT NO", "PROJECT NUM")
# lang = GetString(img, "LANG", "LANG")
# cad = GetString(img, "CAD NO", "CAD")
# font = GetString(img, "FONT", "FONT STYLE")
# data_extract["drawing number"] = drawingNum
# data_extract["drawn by"] = drawnBy
# data_extract["checked by"] = checkedBy
# data_extract["title"] = title
# data_extract["approved by"] = approvedBy
# data_extract["contractor"] = contractor
# data_extract["unit"] = unit
# data_extract["status"] = status
# data_extract["page"] = page
# data_extract["projectNum"] = projectNum
# data_extract["lang"] = lang
# data_extract["cad"] = cad
# data_extract["font"] = font
# print(data_extract)