-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgutenberg.py
95 lines (80 loc) · 2 KB
/
gutenberg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import urllib
import sys
import os
name = sys.argv[2]
format = sys.argv[3]
f2 = open("style.css", "r")
t2 = f2.read()
titleS = "## "
titleE = "\n" \
+"\n"
paragraphS = ""
paragraphE = "\n" \
+"\n"
if format == "html" or format=="pdf":
f = open(name+".html", "w")
f.write("<html><head><title>"+name+"</title></head><meta charset='UTF-8'/><style>" + t2 + "</style><body><h1>"+name+"</h1>")
titleS = "<h2 id='kapitel"
titleE = "</h2>"
paragraphS = "<p>"
paragraphE = "</p>"
else:
f = open(name+".md", "w")
f.write("# " + name + "\n")
j = 0
base = "https://www.projekt-gutenberg.org/" + sys.argv[1]
print("Titel wird heruntergeladen...")
s = urllib.urlopen(base + "/index.html")
t = s.read()
ul = t.find('ul')
ul2 = t.find("</ul>", ul)
ul = t.find("li", ul)
l = 0
titles = []
urls = []
while True:
ul = t.find("<li>", ul+1, ul2)
href1 = t.find("href", ul)+6
href2 = t.find(">", href1)-1
url = t[href1:href2]
if "TYPE" in url:
break
urls.append(t[href1:href2])
title2 = t.find("</a>", href2)
titles.append(t[href2+2:title2])
if ul < 0:
break
l += 1
if format == "html" or format == "pdf":
f.write("<ul>")
for i in range(l):
f.write("<li><a href='#kapitel" + str(i+1) + "'>" + titles[i] + "</a></li>")
f.write("</ul>")
f.flush()
while j < l:
if titleS == "<h2 id='kapitel":
titleS += str(j+1)+"'>"
f.write(titleS + titles[j] + titleE)
titleS = "<h2 id='kapitel"
else:
f.write(titleS + titles[j] + titleE)
link = base + "/" + urls[j]
s = urllib.urlopen(link)
t = s.read()
b = t.find('Inhalt')
b2 = b
while True:
b2 = t.find("<p>", b)+3
if b2 < 3:
break
b = t.find("</p>", b+1)
if b < 0:
break
f.write(paragraphS + (t[b2:b]) + paragraphE)
f.flush()
j+=1
if format == "html":
f.write("</body></html>")
f.close()
if format == "pdf":
os.system("python convert.py '" + name + "'")