-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathmain.py
134 lines (116 loc) · 5.27 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import requests
from bs4 import BeautifulSoup
from time import sleep
import cairosvg
import os
from PyPDF2 import PdfFileMerger
from fpdf import FPDF
from PIL import Image
import urllib3.contrib.pyopenssl
import progressbar
#Configurations
# URLs of the book pages to download; fill this list in before running.
book_url_list = []
# When True the SVG pipeline is skipped and only the JPG-based PDF is built.
# NOTE(review): the original indentation was lost in the reviewed copy; this
# assumes the `if not only_pdf` guard covered only the SVG section, because
# `image_list` was re-initialised right before the JPG section. Confirm.
only_pdf = False

# Network behaviour for every HTTP request made by this script.
MAX_DOWNLOAD_ATTEMPTS = 5
RETRY_DELAY_SECONDS = 2
REQUEST_TIMEOUT_SECONDS = 30
REQUEST_HEADERS = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
}

# Characters that are rejected in file names on at least one major platform.
_UNSAFE_FILENAME_CHARS = '<>:"/\\|?*'


def _between(text, start_marker, end_marker):
    """Return the part of *text* between *start_marker* and *end_marker*.

    Raises:
        ValueError: if either marker is missing. ``str.find`` would return -1
            and silently produce a wrong slice, so the failure is made explicit.
    """
    start = text.find(start_marker)
    if start == -1:
        raise ValueError(f"marker {start_marker!r} not found in description: {text!r}")
    start += len(start_marker)
    # Search for the end marker only after the start marker so an earlier
    # occurrence of the same text cannot produce an empty or reversed slice.
    end = text.find(end_marker, start)
    if end == -1:
        raise ValueError(f"marker {end_marker!r} not found in description: {text!r}")
    return text[start:end]


def parse_book_description(description):
    """Extract ``(title, page_count)`` from the page's description meta text.

    The text is expected to contain ``Title: <title>, Author`` and
    ``Length: <n> pages,``.

    Raises:
        ValueError: if a marker is missing, the length is not an integer, or
            the book reports fewer than one page.
    """
    title = _between(description, "Title:", ", Author").strip()
    raw_length = _between(description, "Length:", " pages,").strip()
    try:
        page_count = int(raw_length)
    except ValueError:
        raise ValueError(f"book length is not a number: {raw_length!r}") from None
    if page_count < 1:
        raise ValueError(f"book reports no pages (length={page_count})")
    return title, page_count


def safe_filename(name):
    """Return *name* with path separators and other unsafe characters replaced.

    The book title comes from a remote page and ends up in the output file
    name, so characters such as ``/`` or ``:`` must not reach ``open()``.
    An empty result falls back to ``"untitled"``.
    """
    cleaned = "".join(
        "_" if (char in _UNSAFE_FILENAME_CHARS or ord(char) < 32) else char
        for char in name
    )
    # Leading/trailing dots and spaces are dropped: trailing ones are invalid
    # on Windows and a leading dot would create a hidden file on POSIX.
    cleaned = cleaned.strip(" .")
    return cleaned or "untitled"


def _fetch(session, url):
    """GET *url* through *session*, retrying a bounded number of times.

    Only ``requests`` errors (connection problems, timeouts, HTTP error
    statuses) are retried; anything else, including Ctrl-C, propagates
    immediately.

    Raises:
        RuntimeError: when every attempt failed; the last ``requests`` error
            is chained as the cause.
    """
    last_error = None
    for attempt in range(1, MAX_DOWNLOAD_ATTEMPTS + 1):
        try:
            response = session.get(
                url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT_SECONDS
            )
            # Without this an HTML error page would be saved as a page image.
            response.raise_for_status()
            return response
        except requests.RequestException as error:
            last_error = error
            if attempt < MAX_DOWNLOAD_ATTEMPTS:
                sleep(RETRY_DELAY_SECONDS)
    raise RuntimeError(
        f"giving up on {url} after {MAX_DOWNLOAD_ATTEMPTS} attempts"
    ) from last_error


def _read_book_info(session, book_url):
    """Fetch the book page and return ``(page_url_prefix, title, page_count)``.

    ``page_url_prefix`` is the ``og:image`` URL with the trailing
    ``1.svgz`` / ``1.jpg`` removed, so appending ``<n>.svgz`` or ``<n>.jpg``
    addresses page *n*.

    Raises:
        ValueError: if the page lacks the expected meta tags or the
            description cannot be parsed.
    """
    response = _fetch(session, book_url)
    head = BeautifulSoup(response.text, "lxml").head
    image_meta = None
    description_meta = None
    if head is not None:
        image_meta = head.find('meta', attrs={'property': 'og:image'})
        description_meta = head.find('meta', attrs={'name': 'description'})
    if image_meta is None or description_meta is None:
        raise ValueError(f"{book_url} does not expose og:image/description meta tags")
    page_url_prefix = image_meta["content"].replace("1.svgz", "").replace("1.jpg", "")
    title, page_count = parse_book_description(description_meta["content"])
    return page_url_prefix, title, page_count


def _download_pages(session, page_url_prefix, page_count, remote_ext, local_ext, saved_paths):
    """Download pages 1..page_count into ``output<n><local_ext>`` files.

    Each path is appended to *saved_paths* before the file is written so the
    caller can clean up even when a download fails part-way through.
    """
    for page in progressbar.progressbar(range(1, page_count + 1)):
        response = _fetch(session, f"{page_url_prefix}{page}{remote_ext}")
        path = f"output{page}{local_ext}"
        saved_paths.append(path)
        with open(path, 'wb') as out_file:
            out_file.write(response.content)


def _remove_files(paths):
    """Delete every file in *paths*, ignoring ones that were never created."""
    for path in paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            # A path is registered before its file is written, so a failed
            # step can leave entries with nothing on disk.
            pass


def _build_svg_pdf(session, page_url_prefix, file_title, page_count):
    """Download all pages as SVG, convert each to PDF and merge the result.

    Writes ``SVG - <file_title>.pdf`` and removes the intermediate files,
    also when a step fails.
    """
    svg_files = []
    pdf_files = []
    try:
        #Download all SVG files
        print(f"Downloading all {page_count} SVG files...")
        _download_pages(session, page_url_prefix, page_count, ".svgz", ".svg", svg_files)

        #Convert SVG to PDF
        print("Converting all SVG files to PDF...")
        with progressbar.ProgressBar(max_value=page_count) as bar:
            for done, svg_path in enumerate(svg_files, start=1):
                pdf_path = f"output{done}.pdf"
                pdf_files.append(pdf_path)
                cairosvg.svg2pdf(url=svg_path, write_to=pdf_path)
                # Report completed pages so the bar actually reaches its max.
                bar.update(done)

        #Join all PDF's
        print("Joining and exporting all PDFs...")
        merger = PdfFileMerger()
        try:
            with progressbar.ProgressBar(max_value=page_count) as bar:
                for done, pdf_path in enumerate(pdf_files, start=1):
                    merger.append(pdf_path)
                    bar.update(done)
            merger.write("SVG - " + file_title + ".pdf")
        finally:
            merger.close()
    finally:
        _remove_files(svg_files + pdf_files)


def _build_jpg_pdf(session, page_url_prefix, file_title, page_count):
    """Download all pages as JPG and assemble them into one PDF.

    Writes ``JPG - <file_title>.pdf``; the page size is taken from the first
    image. Intermediate files are removed, also when a step fails.
    """
    jpg_files = []
    try:
        #Download all JPG files
        print("Downloading all JPG files...")
        _download_pages(session, page_url_prefix, page_count, ".jpg", ".jpg", jpg_files)

        #Join all JPG's
        print("Joining all JPG files into final PDF...")
        # Only the dimensions are needed; close the image handle right away.
        with Image.open(jpg_files[0]) as cover:
            width, height = cover.size
        pdf = FPDF(unit="pt", format=[width, height])
        with progressbar.ProgressBar(max_value=page_count) as bar:
            for done, page in enumerate(jpg_files, start=1):
                pdf.add_page()
                pdf.image(page, 0, 0)
                bar.update(done)
        pdf.output("JPG - " + file_title + ".pdf", "F")
    finally:
        _remove_files(jpg_files)


def main():
    """Build the PDFs for every URL in ``book_url_list``.

    For each book the page metadata is fetched once; the SVG-based PDF is
    built unless ``only_pdf`` is set, and the JPG-based PDF is always built.
    Does nothing when ``book_url_list`` is empty.
    """
    if not book_url_list:
        return
    with requests.Session() as session:
        for book_url in book_url_list:
            page_url_prefix, title, page_count = _read_book_info(session, book_url)
            file_title = safe_filename(title)
            if not only_pdf:
                #
                #Download from SVG to PDF
                #
                print("--> Downloading from SVG to PDF")
                _build_svg_pdf(session, page_url_prefix, file_title, page_count)
            #
            #Download from JPG to PDF
            #
            print("")
            print("--> Downloading from JPG to PDF")
            _build_jpg_pdf(session, page_url_prefix, file_title, page_count)


if __name__ == "__main__":
    main()