refactor: crawler and add future annotations in some places
ajskateboarder committed Jan 20, 2024
1 parent 71c49d0 commit dd9ff49
Showing 11 changed files with 129 additions and 144 deletions.
48 changes: 23 additions & 25 deletions scripts/generate_reviews.py
@@ -7,13 +7,23 @@
 import sys
 from functools import partial
 from threading import Lock
+import logging
 
-import crawling as utils
+from crawling import bestsellers_reviews
+from crawling.dicts import Reviews
 
 from wordsmyth import rate
 
 lock = Lock()
 
+logging.basicConfig(
+    format="[%(levelname)s] %(asctime)s: %(message)s",
+    datefmt="%d-%b-%y %H:%M:%S",
+    level=logging.DEBUG,
+)
+logging.getLogger("selenium").setLevel(logging.WARNING)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
 
 
 class LockedSqliteConnection:
     """https://stackoverflow.com/a/41206801"""
@@ -28,17 +38,15 @@ def __enter__(self) -> LockedSqliteConnection:
         self.cursor = self.connection.cursor()
         return self
 
-    def __exit__(self, typ, value, traceback) -> None:
+    def __exit__(self, *_) -> None:
         self.lock.release()
         self.connection.commit()
         if self.cursor is not None:
             self.cursor.close()
             self.cursor = None # type: ignore
 
 
-def process_reviews(
-    reviews: utils.threaded_reviews.Item, db: LockedSqliteConnection
-) -> None:
+def process_reviews(reviews: Reviews, db: LockedSqliteConnection) -> None:
     productId = reviews["productId"]
     with lock:
         for review in reviews["items"]:
@@ -62,8 +70,13 @@ def process_reviews(
                 return
             try:
                 db.cursor.execute(
-                    f"INSERT INTO {productId} VALUES(?, ?, ?)",
-                    (review["reviewText"], review["overall"], prediction),
+                    f"INSERT INTO {productId} VALUES(?, ?, ?, ?)",
+                    (
+                        review["reviewText"],
+                        review["overall"],
+                        prediction,
+                        ",".join(flags),
+                    ),
                 )
             except AttributeError:
                 db.cursor = db.connection.cursor()
@@ -74,27 +87,12 @@ def process_reviews(
 
 
 def main() -> None:
-    from loguru import logger
-
-    HEADLESS = True
+    HEADLESS = False
 
     db = LockedSqliteConnection(sys.argv[1])
 
-    with utils.BestSellersLinks(HEADLESS) as products:
-        logger.info("Collecting product IDs")
-        product_ids = list(products.get_bestselling())
-
-    with utils.AmazonScraper(HEADLESS) as prop:
-        with utils.ParallelAmazonScraper(HEADLESS) as scrapers:
-            logger.info("Logging scrapers in")
-            scrapers.login(os.environ["EMAIL"], os.environ["PASSWORD"])
-            # scrapers.scrape(product_id, partial(process_reviews, db=db), proportions)
-            for product_id in product_ids:
-                proportions = prop.get_proportions(product_id)
-                logger.info(f"Collecting review proportions for: {product_id}")
-
-                logger.info(f"Scraping: {product_id}")
-                scrapers.scrape(product_id, partial(process_reviews, db=db), proportions) # type: ignore
+    scraper = bestsellers_reviews(partial(process_reviews, db=db), HEADLESS)
+    scraper(os.environ["EMAIL"], os.environ["PASSWORD"])
 
 
 if __name__ == "__main__":
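For readers skimming the diff: LockedSqliteConnection wraps one sqlite3 connection in a threading.Lock so the parallel scrapers can share a single database handle. A minimal, self-contained sketch of the pattern — the schema, the check_same_thread=False flag, and the commit-before-release ordering are illustrative choices for this demo, not taken from the repo:

"""Standalone sketch of the LockedSqliteConnection pattern used above."""
import sqlite3
from threading import Lock, Thread


class LockedSqliteConnection:
    """A sqlite3 connection guarded by a Lock (per https://stackoverflow.com/a/41206801)."""

    def __init__(self, path: str) -> None:
        # check_same_thread=False lets several threads share the handle;
        # the Lock is what actually serializes access.
        self.connection = sqlite3.connect(path, check_same_thread=False)
        self.cursor = None
        self.lock = Lock()

    def __enter__(self) -> "LockedSqliteConnection":
        self.lock.acquire()
        self.cursor = self.connection.cursor()
        return self

    def __exit__(self, *_) -> None:
        # Commit while still holding the lock so another thread's writes
        # cannot interleave with this transaction.
        self.connection.commit()
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        self.lock.release()


def worker(db: LockedSqliteConnection, rows: list) -> None:
    with db:  # only one thread inside at a time
        db.cursor.executemany("INSERT INTO reviews VALUES(?, ?)", rows)


db = LockedSqliteConnection("reviews.db")
with db:
    db.cursor.execute("CREATE TABLE IF NOT EXISTS reviews(text TEXT, rating INTEGER)")
threads = [Thread(target=worker, args=(db, [(f"review {i}", i % 5 + 1)])) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()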
4 changes: 1 addition & 3 deletions src/crawling/__init__.py
@@ -1,4 +1,2 @@
 """Amazon review collection utilities"""
-from .ids import BestSellersLinks
-from .sync_reviews import AmazonScraper
-from .threaded_reviews import AmazonScraper as ParallelAmazonScraper
+from .generator import bestsellers_reviews
4 changes: 4 additions & 0 deletions src/crawling/exceptions.py
@@ -1,6 +1,10 @@
+from __future__ import annotations
+
+
 class PrematureBrowserExit(Exception):
     """Raised when browsers close early for whatever reason"""
 
 
 class CAPTCHAError(Exception):
     """Detected by Amazon and requires a CAPTCHA to proceed"""
 
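A short, hypothetical example of how callers can branch on these two exceptions — start_crawl is a stand-in, not a function in this repo:

"""Hypothetical caller showing how the crawler's exceptions are meant to be handled."""
from crawling.exceptions import CAPTCHAError, PrematureBrowserExit


def start_crawl() -> None:
    # Stand-in for a real crawl entry point; always fails for the demo.
    raise PrematureBrowserExit("browser closed before the crawl finished")


try:
    start_crawl()
except CAPTCHAError:
    # Amazon flagged the session; a captcha hook (see generator.py) must solve it.
    print("CAPTCHA required, retry with a captcha hook")
except PrematureBrowserExit as err:
    print(f"browser session died early: {err}")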
51 changes: 51 additions & 0 deletions src/crawling/generator.py
@@ -0,0 +1,51 @@
+"""Review generation helpers"""
+from __future__ import annotations
+from typing import Callable, Protocol, Any, cast, TYPE_CHECKING
+import subprocess
+import logging
+
+from .sync_reviews import AmazonScraper
+from .threaded_reviews import AmazonScraper as ParallelAmazonScraper
+
+if TYPE_CHECKING:
+    from selenium.webdriver import Firefox
+
+
+class Scraper(Protocol):
+    def __call__(self, email: str, password: str) -> None:
+        ...
+
+
+def kitty_captcha(browser: Firefox, _: Any) -> str:
+    captcha_image = cast(
+        str,
+        browser.find_element("css selector", "img[alt='captcha']").get_attribute("src"),
+    )
+    subprocess.run(["/usr/bin/kitty", "icat", captcha_image], check=True)
+    return input("(login) Please solve the provided captcha: ")
+
+
+def bestsellers_reviews(callback: Callable, headless: bool) -> Scraper:
+    """Returns a scraping function to scrape reviews from Amazon's bestselling"""
+
+    def scraper(email: str, password: str) -> None:
+        logging.info("Starting product ID gatherer")
+        with AmazonScraper(headless) as products:
+            logging.info("Collecting product IDs")
+            product_ids = products.get_bestselling()
+            logging.info("Collected following IDs: %s", ",".join(product_ids))
+        logging.info("Initializing review gatherer")
+        with AmazonScraper(headless) as prop:
+            with ParallelAmazonScraper(headless) as scrapers:
+                scrapers.captcha_hook = kitty_captcha
+                logging.info("Logging scrapers in")
+                scrapers.login(email, password)
+
+                for product_id in product_ids:
+                    logging.info("Initiating scrape process for: %s", product_id)
+                    logging.info("\tCollecting review proportions")
+                    proportions = prop.get_proportions(product_id)
+                    logging.info("\tScraping")
+                    scrapers.scrape(product_id, callback, proportions) # type: ignore
+
+    return scraper
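The new bestsellers_reviews factory is what scripts/generate_reviews.py now calls. A sketch of wiring it to a custom callback — the in-memory sink is illustrative; the env vars are the ones the script already reads:

"""Sketch: consuming bestsellers_reviews with a custom callback."""
import os
from functools import partial

from crawling import bestsellers_reviews
from crawling.dicts import Reviews


def collect(reviews: Reviews, sink: list) -> None:
    # Stand-in for process_reviews: keep batches in memory instead of sqlite.
    sink.append(reviews)


batches: list = []
# bestsellers_reviews returns a Scraper: any callable taking (email, password).
scraper = bestsellers_reviews(partial(collect, sink=batches), headless=True)
scraper(os.environ["EMAIL"], os.environ["PASSWORD"])
print(f"collected {len(batches)} review batches")

Because Scraper is a Protocol, any callable with the (email, password) signature satisfies the return type structurally; no subclassing is involved.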
83 changes: 0 additions & 83 deletions src/crawling/ids.py

This file was deleted.

29 changes: 17 additions & 12 deletions src/crawling/sync_reviews.py
@@ -6,12 +6,14 @@
 import time
 from typing import Any, Generator, Union, cast
 from urllib.parse import urlparse
+from urllib3.exceptions import MaxRetryError
 
 from bs4 import BeautifulSoup
 from selenium.webdriver import Firefox, FirefoxOptions
 from selenium.webdriver.common.by import By
 
 from .dicts import Review
+from .exceptions import PrematureBrowserExit
 
 
 class AmazonScraper:
@@ -32,28 +34,31 @@ def __enter__(self) -> AmazonScraper:
     def __exit__(self, *_: Any) -> None:
         self.close()
 
-    def get_bestselling(self) -> Generator[str, None, None]:
+    def get_bestselling(self) -> list[str]:
         """Fetch product IDs from Amazon's Bestsellers page"""
-        self.browser.get("https://www.amazon.com/gp/bestsellers/")
+        try:
+            self.browser.get("https://www.amazon.com/gp/bestsellers/")
+        except MaxRetryError as e:
+            raise PrematureBrowserExit(
+                "Failed to access a browser session. Did you format your 'with' blocks correctly?"
+            ) from e
         ids = []
         for _ in range(3):
             for link in self.browser.find_elements(By.CSS_SELECTOR, "a.a-link-normal"):
                 try:
-                    if "product-reviews" in cast(str, link.get_attribute("href")):
-                        product_id = cast(
-                            str, urlparse(link.get_attribute("href")).path
-                        ).split("/")[2]
-                        if not product_id in ids:
-                            ids.append(product_id)
-                            yield product_id
-                    else:
-                        continue
+                    if "product-reviews" not in cast(str, link.get_attribute("href")):
+                        continue
+                    product_id = cast(
+                        str, urlparse(link.get_attribute("href")).path
+                    ).split("/")[2]
+                    ids.append(product_id)
                 except Exception:
                     break
             try:
                 self.browser.execute_script("window.scrollBy(0, 1000)") # type: ignore
             except Exception:
                 pass
+        return list(set(ids))
 
     def fetch_product_reviews(
         self, asin: str, pages: int = 10
@@ -100,7 +105,7 @@ def get_product_source(
 
     @staticmethod
     def select_reviews(content: Any) -> Generator[Review, None, None]:
-        """Select reviews from a Amazon page source"""
+        """Select reviews from an Amazon page source"""
         for review in content:
             row = review.select_one(".a-row")
             if row is not None:
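Two behavioral changes worth illustrating: get_bestselling now returns a deduplicated list rather than a generator, and a dead browser session surfaces as PrematureBrowserExit instead of a bare urllib3 MaxRetryError. A hedged sketch, assuming a working Firefox/geckodriver and that requests against a closed session fail with MaxRetryError:

"""Sketch of the new get_bestselling contract."""
from crawling.sync_reviews import AmazonScraper
from crawling.exceptions import PrematureBrowserExit

# Inside the with block the session is alive, and the result is now a
# plain (deduplicated) list of product IDs.
with AmazonScraper(True) as scraper:
    ids = scraper.get_bestselling()
    print(f"{len(ids)} unique product IDs")

# Outside the with block the session has been torn down; the request used
# to bubble up urllib3's MaxRetryError and is now reported more helpfully.
try:
    scraper.get_bestselling()
except PrematureBrowserExit as err:
    print(err)

One side effect of list(set(ids)): bestseller ordering is not preserved. If rank order ever matters, list(dict.fromkeys(ids)) would deduplicate while keeping it.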