fix: scrapers

ajskateboarder committed Jan 26, 2024
1 parent 928a1b2 commit 8e7bca8
Showing 6 changed files with 156 additions and 113 deletions.
67 changes: 36 additions & 31 deletions scripts/generate_reviews.py
@@ -4,74 +4,79 @@

 import os
 import sys
-from functools import partial
+from math import inf
 from uuid import uuid4
 import logging

 from sqlite3worker.sqlite3worker import Sqlite3Worker

 from crawling import bestsellers_reviews
-from crawling.dicts import Reviews
+from crawling.items import Reviews
 from wordsmyth import rate


-logging.basicConfig(
-    format="[%(levelname)s] %(asctime)s: %(message)s",
-    datefmt="%d-%b-%y %H:%M:%S",
-    level=logging.DEBUG,
-    # filename="something.log",
-)
-logging.getLogger("selenium").setLevel(logging.WARNING)
-logging.getLogger("urllib3").setLevel(logging.WARNING)
-
-
 def process_reviews(reviews: Reviews, db: Sqlite3Worker) -> None:
-    productId = reviews["productId"]
-    for review in reviews["items"]:
-        if review["reviewText"].strip() == "":
+    product_id = reviews.product_id
+    for review in reviews.items:
+        if review.text.strip() == "":
            return
        db.execute(
-            f"CREATE TABLE IF NOT EXISTS {productId}(text, actual, prediction, flags)"
+            f"CREATE TABLE IF NOT EXISTS {product_id}(text, actual, prediction, flags)"
        )

        try:
            prediction, flags = rate(
-                review["reviewText"]
-                .replace(
+                review.text.replace(
                    " The media could not be loaded.\n ",
                    "",
-                )
-                .strip(),
+                ).strip(),
                flags=True,
            )
-        except Exception:
+        except Exception as e:
+            logging.error(
+                "Exception raised when attempting to rate %s: %s", review.text, e
+            )
            return
        try:
            db.execute(
-                f"INSERT INTO {productId} VALUES(?, ?, ?, ?)",
+                f"INSERT INTO {product_id} VALUES(?, ?, ?, ?)",
                (
-                    review["reviewText"],
-                    review["overall"],
+                    review.text,
+                    review.rating,
                    prediction,
                    ",".join(flags),
                ),
            )
        except AttributeError:
            db.execute(
-                f"INSERT INTO {productId} VALUES(?, ?, ?, ?)",
-                (review["reviewText"], review["overall"], prediction, flags),
+                f"INSERT INTO {product_id} VALUES(?, ?, ?, ?)",
+                (review.text, review.rating, prediction, flags),
            )


 def main() -> None:
-    HEADLESS = False
+    HEADLESS = True

     location = f"{sys.argv[1].split('.')[0]}{str(uuid4())}.sqlite"
-    db = Sqlite3Worker(location)
-    logging.info("Writing reviews to %s", location)
+    logging.basicConfig(
+        format="[%(levelname)s] %(asctime)s: %(message)s",
+        datefmt="%d-%b-%y %H:%M:%S",
+        level=logging.DEBUG,
+        filename=f"{location}.log",
+    )
+    logging.getLogger("selenium").setLevel(logging.WARNING)
+    logging.getLogger("urllib3").setLevel(logging.WARNING)
+
+    db = Sqlite3Worker(location, max_queue_size=inf)
+    print(f"Writing reviews to {location} and logging at {location + '.log'}")
+    print("CTRL+C to exit at any time")

-    scraper = bestsellers_reviews(partial(process_reviews, db=db), HEADLESS)
-    scraper(os.environ["EMAIL"], os.environ["PASSWORD"])
+    scraper = bestsellers_reviews(lambda x: process_reviews(x, db), HEADLESS)
+    try:
+        scraper(os.environ["EMAIL"], os.environ["PASSWORD"])
+    except KeyboardInterrupt:
+        sys.exit()


 if __name__ == "__main__":
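
The partial(process_reviews, db=db) to lambda x: process_reviews(x, db) swap in main() reads like a behavior change but is not: both forms simply bind db into the one-argument callback that bestsellers_reviews expects. A minimal sketch under that assumption, with a stand-in function in place of the real pipeline:

from functools import partial

def process_reviews(reviews, db):
    return (reviews, db)

db = "reviews.sqlite"  # stand-in for the Sqlite3Worker instance
as_partial = partial(process_reviews, db=db)  # old style
as_lambda = lambda x: process_reviews(x, db)  # new style

assert as_partial("batch") == as_lambda("batch")  # same binding, same result
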
21 changes: 0 additions & 21 deletions src/crawling/dicts.py

This file was deleted.

34 changes: 25 additions & 9 deletions src/crawling/generator.py
@@ -1,8 +1,10 @@
 """Review generation helpers"""
 from __future__ import annotations
-from typing import Callable, Protocol, Any, cast, TYPE_CHECKING
-import subprocess
+
+import logging
+import subprocess
+from itertools import count
+from typing import TYPE_CHECKING, Any, Callable, Protocol, cast

 from .sync_reviews import AmazonScraper
 from .threaded_reviews import AmazonScraper as ParallelAmazonScraper
@@ -34,7 +36,11 @@ def scraper(email: str, password: str) -> None:
        with AmazonScraper(headless) as products:
            logging.info("Collecting product IDs")
            product_ids = products.get_bestselling()
-            logging.info("Collected following IDs: %s", ",".join(product_ids))
+            logging.info(
+                "Collected %s product IDs: %s",
+                len(product_ids),
+                ",".join(product_ids),
+            )

        logging.info("Initializing review gatherer")

@@ -43,11 +49,21 @@ def scraper(email: str, password: str) -> None:
        scrapers.captcha_hook = kitty_captcha
        logging.info("Logging scrapers in")
        scrapers.login(email, password)
-        for product_id in product_ids:
-            logging.info("Initiating scrape process for: %s", product_id)
-            logging.info("\tCollecting review proportions")
-            proportions = prop.get_proportions(product_id)
-            logging.info("\tScraping")
-            scrapers.scrape(product_id, callback, proportions)  # type: ignore
+        for i in count(1):
+            logging.info("Starting round %s of scraping", i)
+            for product_id in product_ids[:]:
+                logging.info("Initiating scrape process for: %s", product_id)
+                logging.info("\tCollecting review proportions")
+                data = prop.get_extras(product_id)
+                logging.info(
+                    "Collected %s related product IDs: %s",
+                    len(data.products),
+                    ",".join(data.products),
+                )
+                logging.info("\tScraping")
+                scrapers.scrape(product_id, callback, data.proportions)  # type: ignore
+
+                product_ids.extend(data.products)
+                product_ids.remove(product_id)

    return scraper
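
The new for i in count(1) loop turns product_ids into a crawl frontier: each round iterates over a copy (product_ids[:]) so the list can safely be mutated mid-iteration, enqueues whatever get_extras discovered, and dequeues the finished ID. A self-contained sketch of that queue discipline, where discover stands in for the products field returned by prop.get_extras:

from itertools import count

def crawl_rounds(product_ids, discover):
    for i in count(1):
        if not product_ids:
            return i - 1  # rounds completed
        for product_id in product_ids[:]:  # copy: the list mutates below
            product_ids.extend(discover(product_id))  # enqueue new finds
            product_ids.remove(product_id)  # dequeue the finished product

links = {"A1": ["B2", "C3"], "B2": [], "C3": []}
print(crawl_rounds(["A1"], lambda pid: links.get(pid, [])))  # -> 2

Note that neither the sketch nor the commit deduplicates IDs across rounds, so a product discovered twice is scraped twice, and the real loop has no exit condition of its own — consistent with the new "CTRL+C to exit at any time" workflow in scripts/generate_reviews.py.
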
31 changes: 31 additions & 0 deletions src/crawling/items.py
@@ -0,0 +1,31 @@
+"""Typings for some JSON structures"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass
+class Review:
+    text: str
+    rating: int
+
+
+@dataclass
+class Reviews:
+    product_id: str
+    items: list[Review]
+
+
+@dataclass
+class Product:
+    title: str
+    asin: str
+    rating: str
+    price: str
+    image: str
+
+
+@dataclass
+class ProductPageInfo:
+    proportions: list[float] | list[int]
+    products: list[str]
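
These dataclasses replace the TypedDicts from the deleted dicts.py, which is why call sites elsewhere in the commit move from subscripting (review["reviewText"]) to attribute access (review.text). A quick usage sketch with invented values:

review = Review(text="Works as advertised", rating=4)
reviews = Reviews(product_id="B000EXAMPLE", items=[review])
info = ProductPageInfo(proportions=[9, 9, 17, 34, 100], products=["B000EXAMPLE"])

print(reviews.items[0].rating)  # 4 -- attribute access, no string keys
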
88 changes: 50 additions & 38 deletions src/crawling/sync_reviews.py
@@ -1,29 +1,30 @@
"""Review downloader
Used in review aggregating to find popular products"""
Used in review aggregating to find popular products.
Much of this is unused for scraping products in favor of crawling.threaded_reviews"""
from __future__ import annotations

import time
from typing import Any, Generator, Union, cast
from typing import Any, Generator, cast
from urllib.parse import urlparse
from urllib3.exceptions import MaxRetryError

from bs4 import BeautifulSoup
from selenium.webdriver import Firefox, FirefoxOptions
from selenium.webdriver.common.by import By
from urllib3.exceptions import MaxRetryError

from .dicts import Review
from .exceptions import PrematureBrowserExit
from .items import ProductPageInfo


class AmazonScraper:
"""This implementation uses Firefox and Geckodriver.
`fake_display` creates a virtual display for non-window systems."""

def __init__(self, fake_display: bool = True) -> None:
def __init__(self, headless: bool = True) -> None:
opts = FirefoxOptions()
if fake_display:
if headless:
opts.add_argument("--headless") # type: ignore

self.browser = Firefox(options=opts)
@@ -43,23 +44,32 @@ def get_bestselling(self) -> list[str]:
                "Failed to access a browser session. Did you format your 'with' blocks correctly?"
            ) from e
        ids = []
-        for _ in range(3):
-            for link in self.browser.find_elements(By.CSS_SELECTOR, "a.a-link-normal"):
-                try:
-                    if "product-reviews" not in cast(str, link.get_attribute("href")):
-                        continue
-                    product_id = cast(
-                        str, urlparse(link.get_attribute("href")).path
-                    ).split("/")[2]
-                    ids.append(product_id)
-                except Exception:
-                    break
-            self.browser.execute_script("window.scrollBy(0, document.body.scrollHeight)")  # type: ignore
+        # TODO: speed up this code, it's noticeably slow for some reason
+        for link in self.browser.find_elements(By.CSS_SELECTOR, "a.a-link-normal"):
+            try:
+                self.browser.execute_script("window.scrollBy(0, 1000)")  # type: ignore
+                href = link.get_attribute("href")
+                if "product-reviews" not in cast(str, href):
+                    continue
+                ids.append(cast(str, urlparse(href).path).split("/")[2])
+            except Exception:
+                pass
+                break
        return list(set(ids))

+    @staticmethod
+    def select_reviews(content: Any) -> Generator[dict, None, None]:
+        """Select reviews from an Amazon page source"""
+        for review in content:
+            row = review.select_one(".a-row")
+            if row is not None:
+                rating = int(
+                    row.select_one("i[data-hook='review-star-rating']").text.split(".")[
+                        0
+                    ]
+                )
+                body = row.select_one("span[data-hook='review-body']").text
+                yield {"text": body, "rating": rating}
+
    def fetch_product_reviews(
        self, asin: str, pages: int = 10
    ) -> Generator[dict, None, None]:
@@ -71,24 +81,40 @@
            for item in self.select_reviews(content):
                yield {**item, "productId": asin}

-    def get_proportions(
-        self, asin: str, total: int = 500
-    ) -> Union[list[float], list[int]]:
-        """Return the distribution of reviews to gather from five to one star
+    def get_extras(self, asin: str, total: int = 500) -> ProductPageInfo:
+        """Return the distribution of reviews to gather from five to one star,
+        as well as any IDs for products on the same page
        If `total` is None, return the percentages from a product histogram as floats"""
        self.browser.get(f"https://amazon.com/product-reviews/{asin}")

        percentages = self.browser.find_element(
            By.CSS_SELECTOR, ".histogram"
        ).text.split("\n")[1::2]

        parsed = list(map(lambda p: int(p.replace("%", "")) / 100, percentages))
        if total is None:
            return parsed

        parsed = list(map(lambda x: x * 500, parsed))

        while any(x > 100 for x in parsed):
            parsed = list(map(lambda x: x * 0.99, parsed))
-        return list(reversed(list(map(lambda x: int(x) + 1, parsed))))

+        ids = []
+        self.browser.execute_script("window.scrollBy(0, document.body.scrollHeight)")  # type: ignore
+        for link in self.browser.find_elements(By.CSS_SELECTOR, "a.a-link-normal"):
+            try:
+                href = link.get_attribute("href")
+                if "/dp/" not in cast(str, href):
+                    continue
+                ids.append(cast(str, urlparse(href).path).split("/")[3])
+            except Exception:
+                break
+
+        return ProductPageInfo(
+            list(reversed(list(map(lambda x: int(x) + 1, parsed)))), list(set(ids))
+        )

    def get_product_source(
        self, asin: str, pages: int, delay: float = 0.5
@@ -103,20 +129,6 @@ def get_product_source(
            source = self.browser.page_source
            yield source

-    @staticmethod
-    def select_reviews(content: Any) -> Generator[Review, None, None]:
-        """Select reviews from an Amazon page source"""
-        for review in content:
-            row = review.select_one(".a-row")
-            if row is not None:
-                rating = int(
-                    row.select_one("i[data-hook='review-star-rating']").text.split(".")[
-                        0
-                    ]
-                )
-                body = row.select_one("span[data-hook='review-body']").text
-                yield {"text": body, "rating": rating}
-
    def close(self) -> None:
        """Close the browser"""
        self.browser.quit()
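
The histogram arithmetic that get_extras inherits from get_proportions is easier to follow with concrete numbers. A detached walkthrough with invented percentages (no Selenium involved); list comprehensions replace the map/lambda chains but compute the same thing:

percentages = ["60%", "20%", "10%", "5%", "5%"]  # five- down to one-star rows

parsed = [int(p.replace("%", "")) / 100 for p in percentages]  # [0.6, 0.2, 0.1, 0.05, 0.05]
parsed = [x * 500 for x in parsed]  # target counts: [300.0, 100.0, 50.0, 25.0, 25.0]

# shrink all buckets 1% at a time until none asks for more than 100 reviews
while any(x > 100 for x in parsed):
    parsed = [x * 0.99 for x in parsed]

# int(x) + 1 keeps every bucket non-empty; reversed() flips the result to
# one- through five-star order for ProductPageInfo.proportions
print(list(reversed([int(x) + 1 for x in parsed])))  # [9, 9, 17, 34, 100]
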
