-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExample_CrawleyDynamicWebPage.py
126 lines (82 loc) · 3.2 KB
/
Example_CrawleyDynamicWebPage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
'''
There are two main ways to crawl dynamic web pages:
1) Manually locating the data and parsing the embedded JSON string.
2) Using a headless browser to execute the page's internal JavaScript
(e.g., Selenium, or Pyppeteer, an unofficial Python port of Puppeteer).
To install Selenium and Webdriver Manager, run in bash or cmd:
pip install selenium webdriver-manager
'''
# Dynamic Web Scraping With Python Using Beautiful Soup
# Scraping Dynamic Web Pages in Python Using Selenium
# NOTE(review): the triple-quoted string below is earlier example code that was
# disabled by turning it into a bare string literal (it is evaluated and
# discarded at import time, never executed). It contains two approaches:
#   1) a plain `requests.get` fetch, which only returns the initial HTML and
#      misses JavaScript-rendered content, and
#   2) a headless Selenium scrape that selects elements by class name and
#      prints each contained <h2> heading.
# Kept verbatim for reference; consider moving it to documentation or deleting.
'''import requests
url = 'https://angular.io/'
response = requests.get(url)
html = response.text
print(html)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
url = 'https://angular.io/'
#url = 'https://www.lottoland.com/br'
driver = webdriver.Chrome(service=ChromeService(
ChromeDriverManager().install()))
driver.get(url)
print(driver.page_source)
# Scraping Dynamic Web Pages in Python Using Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
# instantiate options
options = webdriver.ChromeOptions()
# run browser in headless mode
options.headless = True
# instantiate driver
driver = webdriver.Chrome(service=ChromeService(
ChromeDriverManager().install()), options=options)
# load website
url = 'https://angular.io/'
# get the entire website content
driver.get(url)
# select elements by class name
elements = driver.find_elements(By.CLASS_NAME, 'text-container')
for title in elements:
# select H2s, within element, by tag name
heading = title.find_element(By.TAG_NAME, 'h2').text
# print H2s
print(heading)'''
# How to scrape infinite-scroll web pages with Selenium and Python.
#
# Loads a demo infinite-scroll page in headless Chrome, repeatedly scrolls to
# the bottom to trigger lazy loading, and prints the item titles collected.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import time

options = webdriver.ChromeOptions()
# FIX: `options.headless = True` is deprecated and removed in Selenium >= 4.13
# (it raises AttributeError there); pass the Chrome flag explicitly instead.
options.add_argument('--headless=new')
driver = webdriver.Chrome(service=ChromeService(
    ChromeDriverManager().install()), options=options)

# load target website
url = 'https://scrapingclub.com/exercise/list_infinite_scroll/'
driver.get(url)

try:
    # titles collected so far
    items = []
    # current page height; used to detect when no more content loads
    last_height = driver.execute_script('return document.body.scrollHeight')
    # stop scrolling once at least this many titles have been collected
    itemTargetCount = 20

    # Scroll to the bottom until the target count is reached or the page
    # height stops growing (no more content to load).
    while itemTargetCount > len(items):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        # wait for the lazily loaded content to render
        time.sleep(1)
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break
        # FIX: the original wrote `last_height == new_height` — a comparison,
        # not an assignment — so last_height never changed and the loop always
        # broke after the first scroll.
        last_height = new_height

        # Re-query all matches each pass; rebuilding the list (instead of
        # extending it) avoids counting the same titles twice, keeping the
        # itemTargetCount check meaningful.
        elements = driver.find_elements(
            By.XPATH, "//div[@class='card-body']/h4/a")
        items = [element.text for element in elements]

    # print titles
    print(items)
finally:
    # FIX: always release the browser process, even on error (resource leak).
    driver.quit()