# web_crawler.py
from bs4 import BeautifulSoup         # to parse HTML and find the hyperlinks
import requests                       # to make HTTP requests to a site
from argparse import ArgumentParser   # to build the CLI
import os
import mail      # local helper module: email harvesting
import contact   # local helper module: contact harvesting
from urllib.parse import urlparse, urljoin

parser = ArgumentParser()
parser.add_argument("url", help="URL you want to scrape")
parser.add_argument("depth", type=int, help="maximum recursion depth for the search")
parser.add_argument("-e", action='store_true', help="enable email harvesting")
parser.add_argument("-c", action='store_true', help="enable contact harvesting")
p = parser.parse_args()
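
# Example invocation (hypothetical target URL; any reachable site works):
#   python web_crawler.py https://example.com 2 -e -c
# crawls https://example.com two levels deep, harvesting emails and contacts.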

mails = set()      # email addresses harvested so far
contacts = set()   # contact numbers harvested so far
seen = []          # every link discovered so far, in discovery order

def usearch(u, depth):
    """Search the HTML source of `u` for hyperlinks and recursively
    search the linked pages for further links, up to the given depth."""
    if p.e:
        mail.harv(u, fil_1, mails, set())
    if p.c:
        contact.harv(u, fil_2, contacts, set())
    if depth == p.depth:                # depth budget exhausted
        return
    r = requests.get(u, timeout=10)     # timeout so one slow host cannot hang the crawl
    soup = BeautifulSoup(r.content, 'html.parser')
    for tag in soup.find_all('a'):
        link = tag.get('href')
        if link and link != '#':
            # Resolve relative links against the start URL.
            if not link.startswith(("http://", "https://")):
                link = urljoin(p.url, link)
            if link not in seen:
                seen.append(link)
                f.write(link + "\n")
                usearch(link, depth + 1)
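
# Note: each newly discovered link adds a frame of recursion, so crawling a
# heavily linked site at a large depth can approach Python's default
# recursion limit (about 1000 frames).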

# Name the output directory after the target's host (netloc).
title = urlparse(p.url).netloc
os.makedirs(title, exist_ok=True)
f = open("{}/domains.txt".format(title), "a")   # mode "a" creates the file if missing
if p.e:
    fil_1 = open("{}/emails.txt".format(title), "a")
if p.c:
    fil_2 = open("{}/contacts.txt".format(title), "a")
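
# All output lands in ./<netloc>/ : domains.txt always, plus emails.txt
# and/or contacts.txt when -e / -c are given.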
usearch(p.url, 0)
f.close()
if p.e:
    fil_1.close()
if p.c:
    fil_2.close()