-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcrawler.py
29 lines (23 loc) · 812 Bytes
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import urllib.request
import urllib
from lxml.html import fromstring
import os
import requests
class Crawler:
    """Small HTTP helper that fetches pages, parses them, and saves files.

    Attributes:
        base_url: The URL passed to the constructor; stored as given.
        file_name: Default name used by ``download_file`` for the saved
            file. ``None`` until the caller assigns it.
        xpath_selector: Result of ``lxml.html.fromstring`` for the most
            recently fetched page, or ``None`` before the first fetch.
        page: The most recent response returned by the session, or ``None``
            before the first fetch.
        sess: The ``requests.Session`` shared by all page fetches.
    """

    # Directory (relative to the current working directory) for downloads.
    DOWNLOAD_DIR = "downloads"

    def __init__(self, base_url):
        """Store ``base_url``, open a session, and create the download dir."""
        self.base_url = base_url
        self.file_name = None
        self.xpath_selector = None
        self.page = None
        self.sess = requests.Session()
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(self.DOWNLOAD_DIR, exist_ok=True)

    def download_file(self, url, file_name=None):
        """Download ``url`` into the download directory.

        The saved name is chosen in this order: the ``file_name`` argument,
        ``self.file_name``, then the last path segment of ``url``.

        Args:
            url: Location of the resource to retrieve.
            file_name: Optional name for the saved file; overrides
                ``self.file_name`` for this call only.

        Returns:
            The path the file was written to.

        Raises:
            ValueError: If no file name was supplied and none can be derived
                from ``url``.
        """
        from urllib.parse import unquote, urlsplit

        name = file_name if file_name is not None else self.file_name
        if name is None:
            # Fall back to the URL's last path segment. basename() after
            # unquote() also drops any directory part smuggled in as %2F.
            name = os.path.basename(unquote(urlsplit(url).path))
            if name in ("", ".", ".."):
                raise ValueError(
                    "no file name given and none can be derived from URL: "
                    + repr(url)
                )

        destination = self.DOWNLOAD_DIR + "/" + name
        # The directory may have been removed (or the working directory
        # changed) since __init__ ran, so make sure the target exists.
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        # NOTE: urlretrieve does not go through self.sess, so cookies and
        # headers configured on the session are not sent with this request.
        urllib.request.urlretrieve(url, destination)
        return destination

    def _fetch(self, url):
        """GET ``url`` with the session and parse the body into ``xpath_selector``."""
        # requests follows redirects on GET by default; stated explicitly so
        # both public entry points visibly share the same behaviour.
        self.page = self.sess.get(url, allow_redirects=True)
        self.xpath_selector = fromstring(self.page.content)

    def get_document(self, url):
        """Fetch ``url`` and store the response and parsed document on ``self``."""
        self._fetch(url)

    def follow_redirect(self, url):
        """Fetch ``url``, store the parsed document, and return the final URL.

        Returns:
            The ``url`` attribute of the response, i.e. the address reached
            after any redirects were followed.
        """
        self._fetch(url)
        return self.page.url