forked from SteveHedden/cryptoKGTutorial
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdfToTxt.py
22 lines (21 loc) · 802 Bytes
/
pdfToTxt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import PyPDF2
import os
directory = <directory>
rawWhitePapers = directory + "/rawWhitePapers"
txtWhitePapers = directory + "/txtWhitePapers"
for filename in os.listdir(rawWhitePapers):
print(filename)
if filename == ".DS_Store":
pass
else:
pdffileobj = open(directory + "/rawWhitePapers" + "/" + filename, 'rb')
# create reader variable that will read the pdffileobj
pdfreader = PyPDF2.PdfFileReader(pdffileobj)
# This will store the number of pages of this pdf file
x = pdfreader.numPages
for page in range(x):
pageobj = pdfreader.getPage(page)
text = pageobj.extractText()
file1 = open(txtWhitePapers + "/" + filename + ".txt", "a")
file1.writelines(text)
file1.close()