fix: scrapers

ajskateboarder committed Jan 26, 2024
1 parent 928a1b2 commit 8e7bca8
Showing 6 changed files with 156 additions and 113 deletions.
67 changes: 36 additions & 31 deletions scripts/generate_reviews.py
@@ -4,74 +4,79 @@

 import os
 import sys
-from functools import partial
+from math import inf
 from uuid import uuid4
 import logging

 from sqlite3worker.sqlite3worker import Sqlite3Worker

 from crawling import bestsellers_reviews
-from crawling.dicts import Reviews
+from crawling.items import Reviews
 from wordsmyth import rate


-logging.basicConfig(
-    format="[%(levelname)s] %(asctime)s: %(message)s",
-    datefmt="%d-%b-%y %H:%M:%S",
-    level=logging.DEBUG,
-    # filename="something.log",
-)
-logging.getLogger("selenium").setLevel(logging.WARNING)
-logging.getLogger("urllib3").setLevel(logging.WARNING)
-
-
 def process_reviews(reviews: Reviews, db: Sqlite3Worker) -> None:
-    productId = reviews["productId"]
-    for review in reviews["items"]:
-        if review["reviewText"].strip() == "":
+    product_id = reviews.product_id
+    for review in reviews.items:
+        if review.text.strip() == "":
            return
        db.execute(
-            f"CREATE TABLE IF NOT EXISTS {productId}(text, actual, prediction, flags)"
+            f"CREATE TABLE IF NOT EXISTS {product_id}(text, actual, prediction, flags)"
        )

        try:
            prediction, flags = rate(
-                review["reviewText"]
-                .replace(
+                review.text.replace(
                    " The media could not be loaded.\n ",
                    "",
-                )
-                .strip(),
+                ).strip(),
                flags=True,
            )
-        except Exception:
+        except Exception as e:
+            logging.error(
+                "Exception raised when attempting to rate %s: %s", review.text, e
+            )
            return
        try:
            db.execute(
-                f"INSERT INTO {productId} VALUES(?, ?, ?, ?)",
+                f"INSERT INTO {product_id} VALUES(?, ?, ?, ?)",
                (
-                    review["reviewText"],
-                    review["overall"],
+                    review.text,
+                    review.rating,
                    prediction,
                    ",".join(flags),
                ),
            )
        except AttributeError:
            db.execute(
-                f"INSERT INTO {productId} VALUES(?, ?, ?, ?)",
-                (review["reviewText"], review["overall"], prediction, flags),
+                f"INSERT INTO {product_id} VALUES(?, ?, ?, ?)",
+                (review.text, review.rating, prediction, flags),
            )


 def main() -> None:
-    HEADLESS = False
+    HEADLESS = True

     location = f"{sys.argv[1].split('.')[0]}{str(uuid4())}.sqlite"
-    db = Sqlite3Worker(location)
-    logging.info("Writing reviews to %s", location)
+    logging.basicConfig(
+        format="[%(levelname)s] %(asctime)s: %(message)s",
+        datefmt="%d-%b-%y %H:%M:%S",
+        level=logging.DEBUG,
+        filename=f"{location}.log",
+    )
+    logging.getLogger("selenium").setLevel(logging.WARNING)
+    logging.getLogger("urllib3").setLevel(logging.WARNING)
+
+    db = Sqlite3Worker(location, max_queue_size=inf)
+    print(f"Writing reviews to {location} and logging at {location + '.log'}")
+    print("CTRL+C to exit at any time")

-    scraper = bestsellers_reviews(partial(process_reviews, db=db), HEADLESS)
-    scraper(os.environ["EMAIL"], os.environ["PASSWORD"])
+    scraper = bestsellers_reviews(lambda x: process_reviews(x, db), HEADLESS)
+    try:
+        scraper(os.environ["EMAIL"], os.environ["PASSWORD"])
+    except KeyboardInterrupt:
+        sys.exit()


 if __name__ == "__main__":
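
The partial(process_reviews, db=db) to lambda x: process_reviews(x, db) swap in main() reads like a behavior change but is not: both forms simply bind db into the one-argument callback that bestsellers_reviews expects. A minimal sketch under that assumption, with a stand-in function in place of the real pipeline:

from functools import partial

def process_reviews(reviews, db):
    return (reviews, db)

db = "reviews.sqlite"  # stand-in for the Sqlite3Worker instance
as_partial = partial(process_reviews, db=db)  # old style
as_lambda = lambda x: process_reviews(x, db)  # new style

assert as_partial("batch") == as_lambda("batch")  # same binding, same result
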
21 changes: 0 additions & 21 deletions src/crawling/dicts.py

This file was deleted.

34 changes: 25 additions & 9 deletions src/crawling/generator.py
@@ -1,8 +1,10 @@
 """Review generation helpers"""
 from __future__ import annotations
-from typing import Callable, Protocol, Any, cast, TYPE_CHECKING
-import subprocess
+
+import logging
+import subprocess
+from itertools import count
+from typing import TYPE_CHECKING, Any, Callable, Protocol, cast

 from .sync_reviews import AmazonScraper
 from .threaded_reviews import AmazonScraper as ParallelAmazonScraper
@@ -34,7 +36,11 @@ def scraper(email: str, password: str) -> None:
        with AmazonScraper(headless) as products:
            logging.info("Collecting product IDs")
            product_ids = products.get_bestselling()
-            logging.info("Collected following IDs: %s", ",".join(product_ids))
+            logging.info(
+                "Collected %s product IDs: %s",
+                len(product_ids),
+                ",".join(product_ids),
+            )

        logging.info("Initializing review gatherer")

@@ -43,11 +49,21 @@ def scraper(email: str, password: str) -> None:
        scrapers.captcha_hook = kitty_captcha
        logging.info("Logging scrapers in")
        scrapers.login(email, password)
-        for product_id in product_ids:
-            logging.info("Initiating scrape process for: %s", product_id)
-            logging.info("\tCollecting review proportions")
-            proportions = prop.get_proportions(product_id)
-            logging.info("\tScraping")
-            scrapers.scrape(product_id, callback, proportions)  # type: ignore
+        for i in count(1):
+            logging.info("Starting round %s of scraping", i)
+            for product_id in product_ids[:]:
+                logging.info("Initiating scrape process for: %s", product_id)
+                logging.info("\tCollecting review proportions")
+                data = prop.get_extras(product_id)
+                logging.info(
+                    "Collected %s related product IDs: %s",
+                    len(data.products),
+                    ",".join(data.products),
+                )
+                logging.info("\tScraping")
+                scrapers.scrape(product_id, callback, data.proportions)  # type: ignore
+
+                product_ids.extend(data.products)
+                product_ids.remove(product_id)

    return scraper
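
The new for i in count(1) loop turns product_ids into a crawl frontier: each round iterates over a copy (product_ids[:]) so the list can safely be mutated mid-iteration, enqueues whatever get_extras discovered, and dequeues the finished ID. A self-contained sketch of that queue discipline, where discover stands in for the products field returned by prop.get_extras:

from itertools import count

def crawl_rounds(product_ids, discover):
    for i in count(1):
        if not product_ids:
            return i - 1  # rounds completed
        for product_id in product_ids[:]:  # copy: the list mutates below
            product_ids.extend(discover(product_id))  # enqueue new finds
            product_ids.remove(product_id)  # dequeue the finished product

links = {"A1": ["B2", "C3"], "B2": [], "C3": []}
print(crawl_rounds(["A1"], lambda pid: links.get(pid, [])))  # -> 2

Note that neither the sketch nor the commit deduplicates IDs across rounds, so a product discovered twice is scraped twice, and the real loop has no exit condition of its own — consistent with the new "CTRL+C to exit at any time" workflow in scripts/generate_reviews.py.
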
31 changes: 31 additions & 0 deletions src/crawling/items.py
@@ -0,0 +1,31 @@
+"""Typings for some JSON structures"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass
+class Review:
+    text: str
+    rating: int
+
+
+@dataclass
+class Reviews:
+    product_id: str
+    items: list[Review]
+
+
+@dataclass
+class Product:
+    title: str
+    asin: str
+    rating: str
+    price: str
+    image: str
+
+
+@dataclass
+class ProductPageInfo:
+    proportions: list[float] | list[int]
+    products: list[str]
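
These dataclasses replace the TypedDicts from the deleted dicts.py, which is why call sites elsewhere in the commit move from subscripting (review["reviewText"]) to attribute access (review.text). A quick usage sketch with invented values:

review = Review(text="Works as advertised", rating=4)
reviews = Reviews(product_id="B000EXAMPLE", items=[review])
info = ProductPageInfo(proportions=[9, 9, 17, 34, 100], products=["B000EXAMPLE"])

print(reviews.items[0].rating)  # 4 -- attribute access, no string keys
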
88 changes: 50 additions & 38 deletions src/crawling/sync_reviews.py
@@ -1,29 +1,30 @@
"""Review downloader
Used in review aggregating to find popular products"""
Used in review aggregating to find popular products.
Much of this is unused for scraping products in favor of crawling.threaded_reviews"""
from __future__ import annotations

import time
from typing import Any, Generator, Union, cast
from typing import Any, Generator, cast
from urllib.parse import urlparse
from urllib3.exceptions import MaxRetryError

from bs4 import BeautifulSoup
from selenium.webdriver import Firefox, FirefoxOptions
from selenium.webdriver.common.by import By
from urllib3.exceptions import MaxRetryError

from .dicts import Review
from .exceptions import PrematureBrowserExit
from .items import ProductPageInfo


class AmazonScraper:
"""This implementation uses Firefox and Geckodriver.
`fake_display` creates a virtual display for non-window systems."""

def __init__(self, fake_display: bool = True) -> None:
def __init__(self, headless: bool = True) -> None:
opts = FirefoxOptions()
if fake_display:
if headless:
opts.add_argument("--headless") # type: ignore

self.browser = Firefox(options=opts)
@@ -43,23 +44,32 @@ def get_bestselling(self) -> list[str]:
                "Failed to access a browser session. Did you format your 'with' blocks correctly?"
            ) from e
        ids = []
-        for _ in range(3):
-            for link in self.browser.find_elements(By.CSS_SELECTOR, "a.a-link-normal"):
-                try:
-                    if "product-reviews" not in cast(str, link.get_attribute("href")):
-                        continue
-                    product_id = cast(
-                        str, urlparse(link.get_attribute("href")).path
-                    ).split("/")[2]
-                    ids.append(product_id)
-                except Exception:
-                    break
-            self.browser.execute_script("window.scrollBy(0, document.body.scrollHeight)")  # type: ignore
+        # TODO: speed up this code, it's noticeably slow for some reason
+        for link in self.browser.find_elements(By.CSS_SELECTOR, "a.a-link-normal"):
+            try:
+                self.browser.execute_script("window.scrollBy(0, 1000)")  # type: ignore
+                href = link.get_attribute("href")
+                if "product-reviews" not in cast(str, href):
+                    continue
+                ids.append(cast(str, urlparse(href).path).split("/")[2])
+            except Exception:
+                pass
+                break
        return list(set(ids))

+    @staticmethod
+    def select_reviews(content: Any) -> Generator[dict, None, None]:
+        """Select reviews from an Amazon page source"""
+        for review in content:
+            row = review.select_one(".a-row")
+            if row is not None:
+                rating = int(
+                    row.select_one("i[data-hook='review-star-rating']").text.split(".")[
+                        0
+                    ]
+                )
+                body = row.select_one("span[data-hook='review-body']").text
+                yield {"text": body, "rating": rating}
+
    def fetch_product_reviews(
        self, asin: str, pages: int = 10
    ) -> Generator[dict, None, None]:
@@ -71,24 +81,40 @@
            for item in self.select_reviews(content):
                yield {**item, "productId": asin}

-    def get_proportions(
-        self, asin: str, total: int = 500
-    ) -> Union[list[float], list[int]]:
-        """Return the distribution of reviews to gather from five to one star
+    def get_extras(self, asin: str, total: int = 500) -> ProductPageInfo:
+        """Return the distribution of reviews to gather from five to one star,
+        as well as any IDs for products on the same page
        If `total` is None, return the percentages from a product histogram as floats"""
        self.browser.get(f"https://amazon.com/product-reviews/{asin}")

        percentages = self.browser.find_element(
            By.CSS_SELECTOR, ".histogram"
        ).text.split("\n")[1::2]

        parsed = list(map(lambda p: int(p.replace("%", "")) / 100, percentages))
        if total is None:
            return parsed

        parsed = list(map(lambda x: x * 500, parsed))

        while any(x > 100 for x in parsed):
            parsed = list(map(lambda x: x * 0.99, parsed))
-        return list(reversed(list(map(lambda x: int(x) + 1, parsed))))

+        ids = []
+        self.browser.execute_script("window.scrollBy(0, document.body.scrollHeight)")  # type: ignore
+        for link in self.browser.find_elements(By.CSS_SELECTOR, "a.a-link-normal"):
+            try:
+                href = link.get_attribute("href")
+                if "/dp/" not in cast(str, href):
+                    continue
+                ids.append(cast(str, urlparse(href).path).split("/")[3])
+            except Exception:
+                break
+
+        return ProductPageInfo(
+            list(reversed(list(map(lambda x: int(x) + 1, parsed)))), list(set(ids))
+        )

    def get_product_source(
        self, asin: str, pages: int, delay: float = 0.5
@@ -103,20 +129,6 @@ def get_product_source(
            source = self.browser.page_source
            yield source

-    @staticmethod
-    def select_reviews(content: Any) -> Generator[Review, None, None]:
-        """Select reviews from an Amazon page source"""
-        for review in content:
-            row = review.select_one(".a-row")
-            if row is not None:
-                rating = int(
-                    row.select_one("i[data-hook='review-star-rating']").text.split(".")[
-                        0
-                    ]
-                )
-                body = row.select_one("span[data-hook='review-body']").text
-                yield {"text": body, "rating": rating}
-
    def close(self) -> None:
        """Close the browser"""
        self.browser.quit()
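
The histogram arithmetic that get_extras inherits from get_proportions is easier to follow with concrete numbers. A detached walkthrough with invented percentages (no Selenium involved); list comprehensions replace the map/lambda chains but compute the same thing:

percentages = ["60%", "20%", "10%", "5%", "5%"]  # five- down to one-star rows

parsed = [int(p.replace("%", "")) / 100 for p in percentages]  # [0.6, 0.2, 0.1, 0.05, 0.05]
parsed = [x * 500 for x in parsed]  # target counts: [300.0, 100.0, 50.0, 25.0, 25.0]

# shrink all buckets 1% at a time until none asks for more than 100 reviews
while any(x > 100 for x in parsed):
    parsed = [x * 0.99 for x in parsed]

# int(x) + 1 keeps every bucket non-empty; reversed() flips the result to
# one- through five-star order for ProductPageInfo.proportions
print(list(reversed([int(x) + 1 for x in parsed])))  # [9, 9, 17, 34, 100]
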
