refactor: crawler and add future annotations in some places
ajskateboarder committed Jan 20, 2024
1 parent 71c49d0 commit dd9ff49
Showing 11 changed files with 129 additions and 144 deletions.
48 changes: 23 additions & 25 deletions scripts/generate_reviews.py
@@ -7,13 +7,23 @@
 import sys
 from functools import partial
 from threading import Lock
+import logging
 
-import crawling as utils
+from crawling import bestsellers_reviews
+from crawling.dicts import Reviews
 
 from wordsmyth import rate
 
 lock = Lock()
 
+logging.basicConfig(
+    format="[%(levelname)s] %(asctime)s: %(message)s",
+    datefmt="%d-%b-%y %H:%M:%S",
+    level=logging.DEBUG,
+)
+logging.getLogger("selenium").setLevel(logging.WARNING)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
 
 
 class LockedSqliteConnection:
     """https://stackoverflow.com/a/41206801"""
@@ -28,17 +38,15 @@ def __enter__(self) -> LockedSqliteConnection:
         self.cursor = self.connection.cursor()
         return self
 
-    def __exit__(self, typ, value, traceback) -> None:
+    def __exit__(self, *_) -> None:
         self.lock.release()
         self.connection.commit()
         if self.cursor is not None:
             self.cursor.close()
             self.cursor = None # type: ignore
 
 
-def process_reviews(
-    reviews: utils.threaded_reviews.Item, db: LockedSqliteConnection
-) -> None:
+def process_reviews(reviews: Reviews, db: LockedSqliteConnection) -> None:
     productId = reviews["productId"]
     with lock:
         for review in reviews["items"]:
@@ -62,8 +70,13 @@ def process_reviews(
                 return
             try:
                 db.cursor.execute(
-                    f"INSERT INTO {productId} VALUES(?, ?, ?)",
-                    (review["reviewText"], review["overall"], prediction),
+                    f"INSERT INTO {productId} VALUES(?, ?, ?, ?)",
+                    (
+                        review["reviewText"],
+                        review["overall"],
+                        prediction,
+                        ",".join(flags),
+                    ),
                 )
             except AttributeError:
                 db.cursor = db.connection.cursor()
@@ -74,27 +87,12 @@ def process_reviews(
 
 
 def main() -> None:
-    from loguru import logger
-
-    HEADLESS = True
+    HEADLESS = False
 
     db = LockedSqliteConnection(sys.argv[1])
 
-    with utils.BestSellersLinks(HEADLESS) as products:
-        logger.info("Collecting product IDs")
-        product_ids = list(products.get_bestselling())
-
-    with utils.AmazonScraper(HEADLESS) as prop:
-        with utils.ParallelAmazonScraper(HEADLESS) as scrapers:
-            logger.info("Logging scrapers in")
-            scrapers.login(os.environ["EMAIL"], os.environ["PASSWORD"])
-            # scrapers.scrape(product_id, partial(process_reviews, db=db), proportions)
-            for product_id in product_ids:
-                proportions = prop.get_proportions(product_id)
-                logger.info(f"Collecting review proportions for: {product_id}")
-
-                logger.info(f"Scraping: {product_id}")
-                scrapers.scrape(product_id, partial(process_reviews, db=db), proportions) # type: ignore
+    scraper = bestsellers_reviews(partial(process_reviews, db=db), HEADLESS)
+    scraper(os.environ["EMAIL"], os.environ["PASSWORD"])
 
 
 if __name__ == "__main__":
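For readers skimming the diff: LockedSqliteConnection wraps one sqlite3 connection in a threading.Lock so the parallel scrapers can share a single database handle. A minimal, self-contained sketch of the pattern — the schema, the check_same_thread=False flag, and the commit-before-release ordering are illustrative choices for this demo, not taken from the repo:

"""Standalone sketch of the LockedSqliteConnection pattern used above."""
import sqlite3
from threading import Lock, Thread


class LockedSqliteConnection:
    """A sqlite3 connection guarded by a Lock (per https://stackoverflow.com/a/41206801)."""

    def __init__(self, path: str) -> None:
        # check_same_thread=False lets several threads share the handle;
        # the Lock is what actually serializes access.
        self.connection = sqlite3.connect(path, check_same_thread=False)
        self.cursor = None
        self.lock = Lock()

    def __enter__(self) -> "LockedSqliteConnection":
        self.lock.acquire()
        self.cursor = self.connection.cursor()
        return self

    def __exit__(self, *_) -> None:
        # Commit while still holding the lock so another thread's writes
        # cannot interleave with this transaction.
        self.connection.commit()
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        self.lock.release()


def worker(db: LockedSqliteConnection, rows: list) -> None:
    with db:  # only one thread inside at a time
        db.cursor.executemany("INSERT INTO reviews VALUES(?, ?)", rows)


db = LockedSqliteConnection("reviews.db")
with db:
    db.cursor.execute("CREATE TABLE IF NOT EXISTS reviews(text TEXT, rating INTEGER)")
threads = [Thread(target=worker, args=(db, [(f"review {i}", i % 5 + 1)])) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()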
4 changes: 1 addition & 3 deletions src/crawling/__init__.py
@@ -1,4 +1,2 @@
 """Amazon review collection utilities"""
-from .ids import BestSellersLinks
-from .sync_reviews import AmazonScraper
-from .threaded_reviews import AmazonScraper as ParallelAmazonScraper
+from .generator import bestsellers_reviews
4 changes: 4 additions & 0 deletions src/crawling/exceptions.py
@@ -1,6 +1,10 @@
+from __future__ import annotations
+
+
 class PrematureBrowserExit(Exception):
     """Raised when browsers close early for whatever reason"""
 
 
 class CAPTCHAError(Exception):
     """Detected by Amazon and requires a CAPTCHA to proceed"""
 
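A short, hypothetical example of how callers can branch on these two exceptions — start_crawl is a stand-in, not a function in this repo:

"""Hypothetical caller showing how the crawler's exceptions are meant to be handled."""
from crawling.exceptions import CAPTCHAError, PrematureBrowserExit


def start_crawl() -> None:
    # Stand-in for a real crawl entry point; always fails for the demo.
    raise PrematureBrowserExit("browser closed before the crawl finished")


try:
    start_crawl()
except CAPTCHAError:
    # Amazon flagged the session; a captcha hook (see generator.py) must solve it.
    print("CAPTCHA required, retry with a captcha hook")
except PrematureBrowserExit as err:
    print(f"browser session died early: {err}")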
51 changes: 51 additions & 0 deletions src/crawling/generator.py
@@ -0,0 +1,51 @@
+"""Review generation helpers"""
+from __future__ import annotations
+from typing import Callable, Protocol, Any, cast, TYPE_CHECKING
+import subprocess
+import logging
+
+from .sync_reviews import AmazonScraper
+from .threaded_reviews import AmazonScraper as ParallelAmazonScraper
+
+if TYPE_CHECKING:
+    from selenium.webdriver import Firefox
+
+
+class Scraper(Protocol):
+    def __call__(self, email: str, password: str) -> None:
+        ...
+
+
+def kitty_captcha(browser: Firefox, _: Any) -> str:
+    captcha_image = cast(
+        str,
+        browser.find_element("css selector", "img[alt='captcha']").get_attribute("src"),
+    )
+    subprocess.run(["/usr/bin/kitty", "icat", captcha_image], check=True)
+    return input("(login) Please solve the provided captcha: ")
+
+
+def bestsellers_reviews(callback: Callable, headless: bool) -> Scraper:
+    """Returns a scraping function to scrape reviews from Amazon's bestselling"""
+
+    def scraper(email: str, password: str) -> None:
+        logging.info("Starting product ID gatherer")
+        with AmazonScraper(headless) as products:
+            logging.info("Collecting product IDs")
+            product_ids = products.get_bestselling()
+            logging.info("Collected following IDs: %s", ",".join(product_ids))
+        logging.info("Initializing review gatherer")
+        with AmazonScraper(headless) as prop:
+            with ParallelAmazonScraper(headless) as scrapers:
+                scrapers.captcha_hook = kitty_captcha
+                logging.info("Logging scrapers in")
+                scrapers.login(email, password)
+
+                for product_id in product_ids:
+                    logging.info("Initiating scrape process for: %s", product_id)
+                    logging.info("\tCollecting review proportions")
+                    proportions = prop.get_proportions(product_id)
+                    logging.info("\tScraping")
+                    scrapers.scrape(product_id, callback, proportions) # type: ignore
+
+    return scraper
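The new bestsellers_reviews factory is what scripts/generate_reviews.py now calls. A sketch of wiring it to a custom callback — the in-memory sink is illustrative; the env vars are the ones the script already reads:

"""Sketch: consuming bestsellers_reviews with a custom callback."""
import os
from functools import partial

from crawling import bestsellers_reviews
from crawling.dicts import Reviews


def collect(reviews: Reviews, sink: list) -> None:
    # Stand-in for process_reviews: keep batches in memory instead of sqlite.
    sink.append(reviews)


batches: list = []
# bestsellers_reviews returns a Scraper: any callable taking (email, password).
scraper = bestsellers_reviews(partial(collect, sink=batches), headless=True)
scraper(os.environ["EMAIL"], os.environ["PASSWORD"])
print(f"collected {len(batches)} review batches")

Because Scraper is a Protocol, any callable with the (email, password) signature satisfies the return type structurally; no subclassing is involved.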
83 changes: 0 additions & 83 deletions src/crawling/ids.py

This file was deleted.

29 changes: 17 additions & 12 deletions src/crawling/sync_reviews.py
@@ -6,12 +6,14 @@
 import time
 from typing import Any, Generator, Union, cast
 from urllib.parse import urlparse
+from urllib3.exceptions import MaxRetryError
 
 from bs4 import BeautifulSoup
 from selenium.webdriver import Firefox, FirefoxOptions
 from selenium.webdriver.common.by import By
 
 from .dicts import Review
+from .exceptions import PrematureBrowserExit
 
 
 class AmazonScraper:
@@ -32,28 +34,31 @@ def __enter__(self) -> AmazonScraper:
     def __exit__(self, *_: Any) -> None:
         self.close()
 
-    def get_bestselling(self) -> Generator[str, None, None]:
+    def get_bestselling(self) -> list[str]:
         """Fetch product IDs from Amazon's Bestsellers page"""
-        self.browser.get("https://www.amazon.com/gp/bestsellers/")
+        try:
+            self.browser.get("https://www.amazon.com/gp/bestsellers/")
+        except MaxRetryError as e:
+            raise PrematureBrowserExit(
+                "Failed to access a browser session. Did you format your 'with' blocks correctly?"
+            ) from e
         ids = []
         for _ in range(3):
             for link in self.browser.find_elements(By.CSS_SELECTOR, "a.a-link-normal"):
                 try:
-                    if "product-reviews" in cast(str, link.get_attribute("href")):
-                        product_id = cast(
-                            str, urlparse(link.get_attribute("href")).path
-                        ).split("/")[2]
-                        if not product_id in ids:
-                            ids.append(product_id)
-                            yield product_id
-                    else:
-                        continue
+                    if "product-reviews" not in cast(str, link.get_attribute("href")):
+                        continue
+                    product_id = cast(
+                        str, urlparse(link.get_attribute("href")).path
+                    ).split("/")[2]
+                    ids.append(product_id)
                 except Exception:
                     break
             try:
                 self.browser.execute_script("window.scrollBy(0, 1000)") # type: ignore
             except Exception:
                 pass
+        return list(set(ids))
 
     def fetch_product_reviews(
         self, asin: str, pages: int = 10
@@ -100,7 +105,7 @@ def get_product_source(
 
     @staticmethod
     def select_reviews(content: Any) -> Generator[Review, None, None]:
-        """Select reviews from a Amazon page source"""
+        """Select reviews from an Amazon page source"""
         for review in content:
             row = review.select_one(".a-row")
             if row is not None:
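Two behavioral changes worth illustrating: get_bestselling now returns a deduplicated list rather than a generator, and a dead browser session surfaces as PrematureBrowserExit instead of a bare urllib3 MaxRetryError. A hedged sketch, assuming a working Firefox/geckodriver and that requests against a closed session fail with MaxRetryError:

"""Sketch of the new get_bestselling contract."""
from crawling.sync_reviews import AmazonScraper
from crawling.exceptions import PrematureBrowserExit

# Inside the with block the session is alive, and the result is now a
# plain (deduplicated) list of product IDs.
with AmazonScraper(True) as scraper:
    ids = scraper.get_bestselling()
    print(f"{len(ids)} unique product IDs")

# Outside the with block the session has been torn down; the request used
# to bubble up urllib3's MaxRetryError and is now reported more helpfully.
try:
    scraper.get_bestselling()
except PrematureBrowserExit as err:
    print(err)

One side effect of list(set(ids)): bestseller ordering is not preserved. If rank order ever matters, list(dict.fromkeys(ids)) would deduplicate while keeping it.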