-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataExtraction.py
150 lines (122 loc) · 6.77 KB
/
dataExtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
import sys
import bs4
import wget
import requests
import pandas as pd
import matplotlib.pyplot as plt
from pdfminer.high_level import extract_text
# Paginated listing of Whole-School Evaluation inspection reports; the page
# number (1-based) is appended to this root to fetch each listing page.
WEB_PAGE_ROOT = "https://www.education.ie/en/Publications/Inspection-Reports-Publications/Whole-School-Evaluation-Reports-List/?pageNumber="
def PDFToText(path: str) -> str:
    """Extract the plain text of the PDF at *path*.

    Best-effort: returns the extracted text, or an empty string when
    extraction fails for any reason. The error is logged to stderr,
    never raised, so batch conversion can continue past bad files.
    """
    try:
        return extract_text(path)
    except Exception as e:  # broad on purpose: pdfminer raises many types
        print(f"Error converting {path}: {e}", file=sys.stderr)
        return ""
def DownloadPDF(numberOfPages: int, exportPath: str = "./") -> "list[str]":
    """Scrape the inspection-report listing and download every report PDF.

    Args:
        numberOfPages: number of listing pages to scrape (pages 1..numberOfPages).
        exportPath: directory the PDFs are written to (must already exist).

    Returns:
        The list of local file paths the PDFs were downloaded to.
    """
    General_InspectionReports = pd.DataFrame(columns=['Date','School Roll No.','County','School Name','School Level','Inspection Type','Subject','URL'])
    for pageNumber in range(1, numberOfPages + 1):
        IrelandWebpage = requests.get(WEB_PAGE_ROOT + str(pageNumber))
        CleanIrelandWebpage = bs4.BeautifulSoup(IrelandWebpage.text, "lxml")
        InspectionReports = {}
        ID = 0
        Table = CleanIrelandWebpage.find('table', id="IRList")
        for p in Table.find_all('tr'):
            if ID == 0:
                # First <tr> holds the column headers -- skip it.
                ID += 1
                continue
            cells = p('td')
            # Date arrives as dd/mm/yyyy; reformat to dd_mm_yyyy for file names.
            Date = cells[0].string[:2] + '_' + cells[0].string[3:5] + '_' + cells[0].string[6:]
            InspectionReports[ID] = {
                'Date': Date,
                'School Roll No.': cells[1].string,
                'County': cells[2].string,
                'School Name': cells[3].string,
                'School Level': cells[4].string,
                'Inspection Type': cells[5].string,
                'Subject': cells[6].string,
                # The first 86 chars of the href are the site prefix; keep the tail.
                'URL': cells[7]('a')[0].attrs['href'][86:],
            }
            ID += 1
        df_InspectionReports = pd.DataFrame.from_dict(InspectionReports, orient='index')
        General_InspectionReports = pd.concat([General_InspectionReports, df_InspectionReports])
    print(f"Number of reports to download: {len(General_InspectionReports)}")
    # Normalise the export path once, not on every loop iteration.
    if not exportPath.endswith('/'):
        exportPath += '/'
    exported = []
    for index, row in General_InspectionReports.iterrows():
        DownloadURL = 'https://www.education.ie/en/Publications/Inspection-Reports-Publications/Whole-School-Evaluation-Reports-List/' + row['URL']
        FileName = exportPath + row['School Roll No.'] + '_' + row['Date'] + '.pdf'
        wget.download(DownloadURL, FileName)
        # Report success only after wget has returned (previously this was
        # printed before the transfer even started).
        print('\tReport ' + row['School Roll No.'] + ' downloaded')
        exported.append(FileName)
    return exported
if __name__ == "__main__":
    # Set to True to (re-)download the PDF reports before converting them.
    DOWNLOAD_PDF = False
    NUMBER_OF_PAGES = 200
    PATH_TO_ALL_REPORTS = "./Reports"
    PATH_TO_PDF_REPORTS = PATH_TO_ALL_REPORTS + "/pdf/"
    PATH_TO_TEXT_REPORTS = PATH_TO_ALL_REPORTS + "/plain_text"

    General_InspectionReports = pd.DataFrame(columns=['Date','School Roll No.','County','School Name','School Level','Inspection Type','Subject','URL'])

    # makedirs(..., exist_ok=True) creates parents and is race-free, unlike
    # the exists()/mkdir() pair it replaces.
    os.makedirs(PATH_TO_PDF_REPORTS, exist_ok=True)
    os.makedirs(PATH_TO_TEXT_REPORTS, exist_ok=True)

    if DOWNLOAD_PDF:
        # Turn the paginated web table into a DataFrame.
        for pageNumber in range(1, NUMBER_OF_PAGES + 1):
            IrelandWebpage = requests.get(WEB_PAGE_ROOT + str(pageNumber))
            CleanIrelandWebpage = bs4.BeautifulSoup(IrelandWebpage.text, "lxml")
            InspectionReports = {}
            ID = 0
            Table = CleanIrelandWebpage.find('table', id="IRList")
            for p in Table.find_all('tr'):
                if ID == 0:
                    # First <tr> holds the column headers -- skip it.
                    ID += 1
                    continue
                cells = p('td')
                # Date arrives as dd/mm/yyyy; reformat to dd_mm_yyyy for file names.
                Date = cells[0].string[:2] + '_' + cells[0].string[3:5] + '_' + cells[0].string[6:]
                InspectionReports[ID] = {
                    'Date': Date,
                    'School Roll No.': cells[1].string,
                    'County': cells[2].string,
                    'School Name': cells[3].string,
                    'School Level': cells[4].string,
                    'Inspection Type': cells[5].string,
                    'Subject': cells[6].string,
                    # The first 86 chars of the href are the site prefix; keep the tail.
                    'URL': cells[7]('a')[0].attrs['href'][86:],
                }
                ID += 1
            df_InspectionReports = pd.DataFrame.from_dict(InspectionReports, orient='index')
            General_InspectionReports = pd.concat([General_InspectionReports, df_InspectionReports])
        print(f"Number of reports to download: {len(General_InspectionReports)}")

        # Download the PDFs.
        PDFToConvert = []
        for index, row in General_InspectionReports.iterrows():
            DownloadURL = 'https://www.education.ie/en/Publications/Inspection-Reports-Publications/Whole-School-Evaluation-Reports-List/' + row['URL']
            PDFName = row['School Roll No.'] + '_' + row['Date'] + '.pdf'
            # BUG FIX: store the bare file name *with* extension so the
            # conversion loop below can join it with PATH_TO_PDF_REPORTS,
            # exactly like the os.listdir() branch. The old code stored
            # 'Reports/pdf/<name>' without '.pdf', producing doubled paths
            # ('./Reports/pdf/Reports/pdf/...') and truncated output names.
            PDFToConvert.append(PDFName)
            wget.download(DownloadURL, PATH_TO_PDF_REPORTS + PDFName)
            print('\tReport ' + row['School Roll No.'] + ' downloaded')
        # Convert PDFs to text and drop unusable ones.
        print("\nProcessing data...")
    else:
        # Reuse previously downloaded PDFs (bare file names, e.g. 'xxx.pdf').
        PDFToConvert = os.listdir(PATH_TO_PDF_REPORTS)
        print(PDFToConvert[0:10])

    ConvertionCategories = {"Properly processed": 0, "Not in text format": 0, "Cannot be processed": 0}
    FilesNotConverted = []
    NUMBER_OF_PDF = len(PDFToConvert)
    for index, PDF in enumerate(PDFToConvert):
        # Progress: percent done, errors so far, current index, total.
        print(f"{index / NUMBER_OF_PDF * 100:.1f}%\t-\t{len(FilesNotConverted)}\t-\t{index}\t-\t{NUMBER_OF_PDF}")
        try:
            extracted_report = PDFToText(PATH_TO_PDF_REPORTS + PDF)
            # Heuristic: reports whose glyphs were not mapped to real text
            # come out with stray accented characters -- treat as unusable.
            # TODO(review): confirm "ú" reliably flags such files.
            if "ú" in extracted_report:
                ConvertionCategories["Not in text format"] += 1  # was never counted before
                FilesNotConverted.append(PDF)
                continue
            with open(f"{PATH_TO_TEXT_REPORTS}/{PDF[:-4]}.txt", "w+") as f:
                f.write(extracted_report)
            ConvertionCategories["Properly processed"] += 1  # was never counted before
        except Exception as e:
            ConvertionCategories["Cannot be processed"] += 1
            # BUG FIX: PDF is already a bare file name; the old
            # PDF[len('Reports/pdf/'):] slice chopped its first 12 chars.
            FilesNotConverted.append(PDF)
            print(PDF + f' could not be processed: {e}', file=sys.stderr)
            continue

    print("Data successfully processed!")
    # Guard the division: PDFToConvert may be empty on a fresh checkout.
    if NUMBER_OF_PDF:
        print(f"Number of errors during process: {len(FilesNotConverted)}\t{len(FilesNotConverted) / NUMBER_OF_PDF * 100}%")
    else:
        print("No PDF reports found to process.")