-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharticle_extraction.py
98 lines (77 loc) · 2.78 KB
/
article_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Article Extraction
A collection of functions that can be used for extracting HTML and article text from a URL.
Author: Ashley Williams
"""
import urllib3
import certifi
import sys
from pattern.web import URL, plaintext, GET
def get_html(url):
    """
    Given a URL, return the page HTML using urllib3.

    :param url: The url to extract the HTML from
    :return: On success, the UTF-8 decoded HTML string. On a non-2xx HTTP
        status, a message "Failed to get html, status: <code>". If an
        exception is raised, "-1: " followed by a description of the error.
    """
    try:
        # urllib3.disable_warnings()
        # certifi.where() can fail on some installs; fall back to the
        # legacy certifi.old_where() bundle in that case.
        # Read more: https://github.com/certifi/python-certifi#usage
        try:
            http = urllib3.PoolManager(
                cert_reqs='CERT_REQUIRED',
                ca_certs=certifi.where()
            )
        except Exception:
            # Narrowed from a bare except: so KeyboardInterrupt/SystemExit
            # are no longer swallowed here.
            http = urllib3.PoolManager(
                cert_reqs='CERT_REQUIRED',
                ca_certs=certifi.old_where()
            )
        r = http.request('GET', url, timeout=5.0)
        # Compare the status numerically for the 2xx success range instead
        # of matching the string prefix "2".
        if 200 <= r.status < 300:
            return r.data.decode("utf-8")
        else:
            return "Failed to get html, status: " + str(r.status)
    except Exception as e:
        sys.stdout.write(str(e))
        return "-1: " + str(e)
def pattern_article_extraction(url):
    """
    Extract the article using Pattern. Pattern downloads the page itself,
    so it takes the url rather than pre-fetched HTML.

    Args:
        url: The url to extract the HTML from

    Returns:
        On success, a dict with keys "extracted_text", "extracted_html"
        and "mime_type". On failure, the raised exception object.
    """
    try:
        page = URL(url, method=GET)
        downloaded = page.download(unicode=True)
        # Flatten the document: strip surrounding whitespace and remove all
        # line breaks and tabs before handing it to plaintext().
        cleaned = str(downloaded).strip()  # default is utf-8
        for ws in ("\n", "\t", "\r"):
            cleaned = cleaned.replace(ws, "")
        text_only = plaintext(cleaned)
        # Keep a minimal HTML variant preserving headings, emphasis and links.
        html_kept = plaintext(cleaned, keep={"h1": [], "h2": [], "strong": [], "a": ["href"]})
        return {
            "extracted_text": text_only,
            "extracted_html": html_kept,
            "mime_type": page.mimetype
        }
    except Exception as e:
        sys.stdout.write(str(e))
        return e
def full_extraction(url):
    """
    Runs a complete end-to-end extraction using all other functions.

    :param url: The url to extract the HTML from
    :return: A dict with the original url, the raw HTML from get_html,
        and the Pattern-based extraction result.
    """
    # Dict values are evaluated in order: raw HTML first, Pattern second,
    # matching the original call sequence.
    return {
        "url": url,
        "html": get_html(url),
        "pattern_extraction": pattern_article_extraction(url),
    }