-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFAQ_parse.py
96 lines (75 loc) · 2.14 KB
/
FAQ_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
Take a URL, fetch its HTML, parse it, and generate schema.org FAQPage
JSON-LD markup.
Questions are the <h3> headings whose text ends in "?"; each answer is the text of the sibling <p> tags that follow, up to the next <h3> tag.
"""
import requests
from bs4 import BeautifulSoup
import json
# Fetch the page: the prompt is printed on its own line, then the URL is read
# from standard input.
print("Input URL:")
url = input()
# The timeout keeps the script from hanging on an unresponsive host, and
# raise_for_status() stops an HTTP error page from being parsed as FAQ content.
r = requests.get(url, timeout=30)
r.raise_for_status()
html = r.text
# Parse the HTML
soup = BeautifulSoup(html, "html.parser")

# Punctuation in answers is emitted as HTML character references.
# NOTE(review): in the source as received, the replacement targets were
# identical to the characters being replaced (and the one for the double quote
# was not a valid string literal), presumably because the entities had been
# decoded somewhere along the way. The standard references below are assumed;
# confirm against the intended output.
_ANSWER_ENTITIES = str.maketrans(
    {
        "?": "&#63;",
        "!": "&#33;",
        ".": "&#46;",
        '"': "&quot;",
    }
)

# Collect questions and answers as plain text. A question is an <h3> whose
# text ends in "?"; its answer is the text of the following sibling <p> tags,
# up to the next <h3>.
questions = []
answers = []
for h3 in soup.find_all("h3"):
    # Surrounding whitespace is stripped so that a heading such as
    # "<h3>Why? </h3>" is still recognised as a question.
    question = h3.text.strip()
    if not question.endswith("?"):
        continue
    paragraphs = []
    for sibling in h3.next_siblings:
        if sibling.name == "h3":
            break
        if sibling.name == "p":
            paragraph = sibling.text.strip()
            if paragraph:
                paragraphs.append(paragraph)
    questions.append(question)
    # Paragraphs are separated by a space so adjacent sentences do not run
    # together.
    answers.append(" ".join(paragraphs))
# Turn the punctuation in the answers into HTML entities (single pass).
answers = [a.translate(_ANSWER_ENTITIES) for a in answers]

# Create the FAQPage schema.org markup. The structure is built as Python data
# and serialised with json.dumps so that quotes, backslashes and newlines in
# the page text are escaped correctly, and so that an empty result still
# yields a well-formed document with an empty "mainEntity" list.
faq_schema = {
    "@context": "https://schema.org",
    "@type": "FAQPage",
    "mainEntity": [
        {
            "@type": "Question",
            "name": question,
            "acceptedAnswer": {"@type": "Answer", "text": answer},
        }
        for question, answer in zip(questions, answers)
    ],
}
faq_json = json.dumps(faq_schema, indent=2, ensure_ascii=False)
# A literal "</" inside a string could terminate the enclosing <script>
# element when the markup is embedded in HTML; "<\/" is the same JSON string.
faq_json = faq_json.replace("</", "<\\/")
faqpage = '<script type="application/ld+json">\n' + faq_json + "\n</script>"
# validate the json
def validateJSON(jsondata):
    """Return True if *jsondata* is well-formed JSON, False otherwise.

    The input may be raw JSON, or JSON wrapped in a single enclosing
    ``<script ...> ... </script>`` element. The caller in this script passes
    the complete JSON-LD ``<script>`` block, which ``json.loads`` would
    otherwise always reject, so the wrapper is removed before parsing.

    Only malformed content is reported as False; an argument of a type that
    ``json.loads`` does not accept still raises ``TypeError``.
    """
    payload = jsondata
    if isinstance(jsondata, str):
        stripped = jsondata.strip()
        if stripped.startswith("<script") and stripped.endswith("</script>"):
            # Keep only what lies between the opening tag's ">" and the
            # closing tag. A ">" is guaranteed to exist because the string
            # ends with "</script>".
            payload = stripped[stripped.index(">") + 1 : -len("</script>")]
    try:
        json.loads(payload)
    except ValueError:
        # json.JSONDecodeError is a subclass of ValueError.
        return False
    return True
# Print the generated markup only when it passes validation; otherwise
# report that the generated JSON is invalid.
validation = validateJSON(faqpage)
print(faqpage if validation else "The generated JSON is not valid")