-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_extract.py
95 lines (86 loc) · 3.37 KB
/
data_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import pytesseract
from PIL import Image
import datetime
import cv2
import sys
import os
import os.path
import re
import numpy as np
# Class to extract text from an image; the image file is passed to the constructor.
class Text_Extractor():
    """Run Tesseract OCR on an image file and return the recognised text.

    The image is loaded with OpenCV, upscaled by a factor of two with bicubic
    interpolation and converted to grayscale before being passed to
    pytesseract.
    """

    def __init__(self, image_file):
        """Store the image to process.

        Args:
            image_file: Path of the image file, passed to ``cv2.imread``.
        """
        self.image_file = image_file

    def extract_text(self):
        """Extract the text from the image as a string.

        Returns:
            The result of ``pytesseract.image_to_string`` for the
            pre-processed image, using the English language data.

        Raises:
            OSError: If OpenCV cannot load ``self.image_file``.
        """
        img = cv2.imread(self.image_file)
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of deep inside cv2.resize.
        if img is None:
            raise OSError(
                "Could not read image file: {!r}".format(self.image_file)
            )
        # Resize the image (2x in both directions).
        img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
        # Convert the image to gray.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # The config option carries the Tesseract directory path for the
        # trained data.
        # NOTE(review): the path is passed without an option flag such as
        # --tessdata-dir and is spelled "testdata" -- confirm this is intended.
        text = pytesseract.image_to_string(
            img,
            lang='eng',
            config='"C:\\Program Files (x86)\\Tesseract-OCR\\testdata"',
        )
        return text
# Class to validate whether an image is an Aadhaar card; the OCR text is passed
# to the constructor.
class Aadhar_Card_Validator():
    """Look for Aadhaar card details in OCR text and print what is found."""

    # Header tokens used to locate the name. 'INDIA' is preferred; 'INDIYA' is
    # the alternative spelling the lookup is meant to tolerate.
    _HEADER_MARKERS = ('INDIA', 'INDIYA')
    # Dates written as dd/dd/dddd.
    _DATE_PATTERN = re.compile(r'\d{2}/\d{2}/\d{4}')
    # A run of digits inside a single word.
    _DIGITS_PATTERN = re.compile(r'\d+')

    def __init__(self, text):
        """Store the text to inspect.

        Args:
            text: The string extracted from the image.
        """
        self.text = text

    def is_aadhar_card(self):
        """Print the card details that can be read from the text.

        Prints, in order: a validity banner when the text contains
        'GOVERNMENT OF INDIA' (otherwise the list of words, for debugging),
        the name, the first date found, any year of birth embedded in a word
        containing 'yob', and the Aadhaar number built from every four-digit
        word. Missing details are reported instead of raising.

        Returns:
            None. All results are written to standard output.
        """
        words = self.text.split()

        if 'GOVERNMENT OF INDIA' in self.text:
            print("Aadhar card is valid and the details are below:")
            name = self._name_after_header(words)
        else:
            print(words)  # To check the text items extracted by OCR
            # Slicing tolerates texts with fewer than two words.
            name = " ".join(words[:2])

        if len(name) > 1:
            print("Name: " + name)
        else:
            print("Name not read")

        # Extraction of the date.
        dates = self._DATE_PATTERN.findall(self.text)
        if dates:
            print("Date of birth: " + dates[0])

        aadhar_number = ''
        for word in words:
            if 'yob' in word.lower():
                yob = self._DIGITS_PATTERN.findall(word)
                if yob:
                    print('Year of Birth : ' + yob[0])
            if len(word) == 4 and word.isdigit():
                aadhar_number = aadhar_number + word + ' '

        # Three four-digit groups plus separators give at least 14 characters.
        if len(aadhar_number) >= 14:
            print("Aadhar number is: " + aadhar_number)
        else:
            print("Aadhar number not read")
            print("Try again or try another file")

    def _name_after_header(self, words):
        """Return up to three words following the header, or '' if not found."""
        for marker in self._HEADER_MARKERS:
            if marker in words:
                index = words.index(marker)
                break
        else:
            return ''
        # The name starts three words after the marker when that word is
        # purely alphabetic, otherwise four words after it.
        start = index + 3
        if not (start < len(words) and words[start].isalpha()):
            start = index + 4
        return " ".join(words[start:start + 3])
def main():
    """Command-line entry point: OCR one image and print its card details.

    Expects exactly one command-line argument, the image file name, whose
    extension must be .JPG or .PNG (case-insensitive). Exits with status 1
    after printing a message when either check fails.
    """
    args = sys.argv
    if len(args) != 2:
        print("Wrong number of arguments")
        sys.exit(1)

    image_path = args[1]

    # Only the extension is checked here, not the file contents.
    _, extension = os.path.splitext(image_path)
    if extension.upper() not in ('.JPG', '.PNG'):
        print("Input filename extension should be .JPG or .PNG")
        sys.exit(1)

    # Extract the text, then report the details found in it.
    ocr_text = Text_Extractor(image_path).extract_text()
    Aadhar_Card_Validator(ocr_text).is_aadhar_card()
# Run the command-line entry point only when this file is executed as a
# script, so that importing the module does not trigger OCR or sys.exit.
if __name__ == "__main__":
    main()