scrape.py
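"""Depth-limited recursive scraper: starting from --site, it follows
same-origin links, saves each fetched page as HTML under ./scrape/, and
writes a JSON map of the visited URLs to ./scrape/sitemap.json."""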
import argparse
import json
import os
from collections import defaultdict
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

parser = argparse.ArgumentParser()
parser.add_argument("--site", type=str, required=True, help="root URL to start crawling from")
parser.add_argument("--depth", type=int, default=3, help="maximum recursion depth when following links")

def cleanUrl(url: str) -> str:
    # Turn a URL into a filesystem-safe name,
    # e.g. "https://example.com/docs" -> "example_com-docs".
    return url.replace("https://", "").replace("/", "-").replace(".", "_")

def get_response_and_save(url: str) -> requests.Response:
    # Fetch the page and persist the raw HTML under ./scrape/.
    response = requests.get(url, timeout=30)
    os.makedirs("./scrape", exist_ok=True)
    with open("./scrape/" + cleanUrl(url) + ".html", "wb") as f:
        f.write(response.content)
    return response

def scrape_links(
    scheme: str,
    origin: str,
    path: str,
    depth: int = 3,
    sitemap: dict | None = None,
):
    # Default to None rather than a mutable default argument, which would be
    # shared across separate top-level calls.
    if sitemap is None:
        sitemap = defaultdict(lambda: "")
    siteUrl = scheme + "://" + origin + path
    cleanedUrl = cleanUrl(siteUrl)
    # Stop when the depth budget is exhausted or the page was already visited;
    # always return the sitemap so the caller never receives None.
    if depth < 0 or sitemap[cleanedUrl] != "":
        return sitemap
    sitemap[cleanedUrl] = siteUrl
    response = get_response_and_save(siteUrl)
    soup = BeautifulSoup(response.content, "html.parser")
    for link in soup.find_all("a"):
        raw_href = link.get("href")
        if not raw_href:
            # Skip anchors without an href; urlparse would choke on None.
            continue
        href = urlparse(raw_href)
        # Follow only relative links and same-origin links served over https.
        if (href.netloc != origin and href.netloc != "") or (
            href.scheme != "" and href.scheme != "https"
        ):
            continue
        scrape_links(
            href.scheme or "https",
            href.netloc or origin,
            href.path,
            depth=depth - 1,
            sitemap=sitemap,
        )
    return sitemap

if __name__ == "__main__":
    args = parser.parse_args()
    url = urlparse(args.site)
    sitemap = scrape_links(url.scheme, url.netloc, url.path, depth=args.depth)
    # Write the map of cleaned-name -> original URL next to the saved pages.
    os.makedirs("./scrape", exist_ok=True)
    with open("./scrape/sitemap.json", "w") as f:
        json.dump(sitemap, f)
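
# Minimal usage sketch (assumes the third-party dependencies requests and
# beautifulsoup4 are installed, e.g. via `pip install requests beautifulsoup4`):
#
#   python scrape.py --site https://example.com --depth 2
#
# Fetched pages are saved as .html files under ./scrape/, and the crawl map
# is written to ./scrape/sitemap.json.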