updated download_and_preprocess arguments, store model accuracy metrics
merillium committed Feb 21, 2024
1 parent 5786dcd commit 12e5db0
Showing 6 changed files with 192 additions and 153 deletions.
13 changes: 6 additions & 7 deletions README.md
@@ -3,13 +3,13 @@
This is a work-in-progress package that retrieves training data from the [lichess.org open database](https://database.lichess.org/), then trains a statistical model to detect suspicious players. Currently the app is not functional and has not yet been built.

### Data Download and Preprocessing
To download and preprocess data from the lichess.org open database, you can run the following command:
To download and preprocess data from the lichess.org open database, you can run the following command (specifying the year and month of the data you want to download, and `lichess-open-database` as the source):

```bash
python3 download_and_preprocess_data.py --year 2015 --month 1 --filetype lichess-open-database
python3 download_and_preprocess.py --year 2015 --month 1 --source lichess-open-database
```

The `download_and_preprocess_data.py` script downloads the `.pgn.zst` file corresponding to the month and year specified, decompresses it to a `.pgn` file, and creates the `lichess_downloaded_games` directory to which both files are saved. The script then preprocesses the `.pgn` file, extracts relevant features, and creates the `lichess_player_data` directory, to which a `.csv` file is saved. By default, all raw files in the `lichess_downloaded_games` directory are then deleted because they are typically large and not needed after preprocessing. (This process can be streamlined by reading directly from the decompressed `.pgn` stream instead of first saving it.)
The `download_and_preprocess.py` script downloads the `.pgn.zst` file corresponding to the month and year specified, decompresses it to a `.pgn` file, and creates the `lichess_downloaded_games` directory to which both files are saved. The script then preprocesses the `.pgn` file, extracts relevant features, and creates the `lichess_player_data` directory, to which a `.csv` file is saved. By default, all raw files in the `lichess_downloaded_games` directory are then deleted because they are typically large and not needed after preprocessing. (This process can be streamlined by reading directly from the decompressed `.pgn` stream instead of first saving it.)

### Model Description
This is a simple statistical model that flags players who have performed a certain threshold above their expected performance under the Glicko-2 rating system. The expected performance takes into account each player's complete game history and opponents in the span of the training data. The thresholds are initialized to default values, and then adjusted separately for each 100 point rating bin in the training data.
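The expected performance above can be made concrete with a minimal sketch of the Glicko-2 expected score (a hedged illustration, not the package's code verbatim; the 80.0 default rating deviation is an assumption):

```python
import math

def g(x):
    # Glicko-2 scaling factor: discounts rating differences when
    # rating deviations (uncertainty) are large.
    return 1 / math.sqrt(1 + 3 * x**2 / math.pi**2)

def expected_score(player_rating, opponent_rating, player_rd=80.0, opponent_rd=80.0):
    """Expected score of the player against one opponent under Glicko-2."""
    a = g(math.sqrt(player_rd**2 + opponent_rd**2)) * (player_rating - opponent_rating)
    return 1 / (1 + math.exp(-a))
```

Equal ratings give an expected score of 0.5; comparing actual scores with the sum of expected scores over a player's games yields the performance difference that the thresholds are applied to.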
@@ -44,9 +44,8 @@ Currently working on unit tests, which can be run with the following command:
```make test```, or, to run test files individually, ```PYTHONPATH=. pytest tests/test_model.py```

To-do:
- restructure `make_player_features.py` to parse arguments from `download_and_preprocess_data.py`
- implement progress bars for preprocessing data and model training
- complete data labelling using lichess API calls, with a workaround or retry request if API rate limiting occurs
- write unit tests for scripts that perform feature extraction and data labelling
- write unit tests for `PlayerAnomalyDetectionModel` class and methods (in-progress)
- possible benchmarks for length of time to execute data downloading, preprocessing, and model training depending
on the size of the raw data
- complete unit tests for `PlayerAnomalyDetectionModel` class and methods (in-progress)
- possible benchmarks for length of time to execute data downloading, preprocessing, and model training depending on the size of the raw data
43 changes: 18 additions & 25 deletions download_and_preprocess.py
Original file line number Diff line number Diff line change
@@ -3,13 +3,18 @@
import re
from pathlib import Path
import subprocess
import zstandard as zstd
import pyzstd

from enums import Folders


def download_data(year, month, filetype):
def download_data(year, month, source):
if source != "lichess-open-database":
raise ValueError(
"Source must be lichess-open-database. Support for additional sources will be added in the future."
)

year = str(year)
month = str(month).zfill(2)
url = f"https://database.lichess.org/standard/lichess_db_standard_rated_{year}-{month}.pgn.zst"
filename = f"lichess_db_standard_rated_{year}-{month}.pgn.zst"
Expand Down Expand Up @@ -44,29 +49,15 @@ def download_data(year, month, filetype):


def preprocess_data(filename, remove_raw_files):
"""This function calls parse_pgn.py and make_player_features.py with the filename argument"""
"""This function calls parse_pgn.py and make_player_features.py with pgn and csv filepaths"""

base_filename = Path(filename).stem.split(".")[
BASE_FILE_NAME = Path(filename).stem.split(".")[
0
] ## removes .pgn.zst from extension
PGN_FILE_PATH = f"{Folders.LICHESS_DOWNLOADED_GAMES.value}/{base_filename}.pgn"
ZST_FILE_PATH = f"{Folders.LICHESS_DOWNLOADED_GAMES.value}/{base_filename}.pgn.zst"
PGN_FILE_PATH = f"{Folders.LICHESS_DOWNLOADED_GAMES.value}/{BASE_FILE_NAME}.pgn"
ZST_FILE_PATH = f"{Folders.LICHESS_DOWNLOADED_GAMES.value}/{BASE_FILE_NAME}.pgn.zst"

# decompress .pgn.zst and save as .pgn
# note: for large files, this operation must be done in chunks
# with open(ZST_FILE_PATH, "rb") as compressed_file:
# dctx = zstd.ZstdDecompressor()
# with open(PGN_FILE_PATH, "wb") as output_file:
# for chunk in dctx.read_to_iter(compressed_file):
# output_file.write(chunk)

# with open(ZST_FILE_PATH, 'rb') as compressed_file:
# decompressor = zstd.ZstdDecompressor()
# with decompressor.stream_reader(compressed_file) as reader:
# with open(PGN_FILE_PATH, 'wb') as output_file:
# for chunk in reader:
# output_file.write(chunk)

with open(ZST_FILE_PATH, "rb") as f_in:
compressed_data = f_in.read()

@@ -77,8 +68,10 @@ def preprocess_data(filename, remove_raw_files):

subprocess.run(["python3", "parse_pgn.py", PGN_FILE_PATH])

CSV_FILE_PATH = f"{Folders.LICHESS_PLAYER_DATA.value}/{base_filename}.csv"
# subprocess.run(["python3", "make_player_features.py", filename[:-4]])
CSV_RAW_FEATURES_FILE_PATH = (
f"{Folders.LICHESS_PLAYER_DATA.value}/{BASE_FILE_NAME}.csv"
)
subprocess.run(["python3", "make_player_features.py", CSV_RAW_FEATURES_FILE_PATH])

# Remove the downloaded .pgn.zst and .pgn files
if remove_raw_files:
@@ -91,9 +84,9 @@ def main():
parser.add_argument("--year", type=int, help="Year of the data", required=True)
parser.add_argument("--month", type=int, help="Month of the data", required=True)
parser.add_argument(
"--filetype",
"--source",
type=str,
help="Type of file to download",
help="Source of the file to download",
choices=["lichess-open-database"],
required=True,
)
@@ -104,7 +97,7 @@
)
args = parser.parse_args()

filename = download_data(args.year, args.month, args.filetype)
filename = download_data(args.year, args.month, args.source)
preprocess_data(filename, args.remove_raw_files)
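`preprocess_data` chains the two stages (`parse_pgn.py`, then `make_player_features.py`) with `subprocess.run`. A minimal sketch of that pattern, with the added assumption of `check=True` so a failed stage stops the pipeline (the script above does not currently pass it):

```python
import subprocess
import sys

def run_step(script, *args):
    """Run one pipeline stage as a child Python process."""
    # check=True raises CalledProcessError on a non-zero exit code,
    # so a failed parse stops the pipeline instead of continuing silently.
    completed = subprocess.run([sys.executable, script, *args], check=True)
    return completed.returncode
```

Using `sys.executable` rather than a hard-coded `"python3"` keeps the child process on the same interpreter (and virtual environment) as the parent.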


246 changes: 136 additions & 110 deletions make_player_features.py
@@ -1,113 +1,139 @@
import argparse
import os
from pathlib import Path
import numpy as np
import pandas as pd

BASE_FILE_NAME = "lichess_db_standard_rated_2015-01"

all_player_games_df = pd.read_csv(
f"lichess_player_data/{BASE_FILE_NAME}.csv",
index_col=[0, 1],
)
all_player_games_df.index = all_player_games_df.index.set_names(
["player", "time_control"]
)

## filter out users who have not played enough games
MIN_GAMES = 30
all_player_games_filtered_df = all_player_games_df.loc[
all_player_games_df.groupby(level=["player", "time_control"]).size() >= MIN_GAMES
].copy()

## calculate how much someone exceeds expectations: (actual win rate - expected win rate)
## someone who has a high win rate could just play mostly lower rated opposition

## this is more involved and requires figuring out expected scores for each game
## e.g. if player 1 is 1500 and player 2 is also 1500, player 1 should have an expected score of 0.5
## but the exact nature of the curve depends on the glicko-2 rating system

## we will use the following paper: http://www.glicko.net/glicko/glicko2.pdf
## and this comment left by @chess_in_sgv:

# Let P2 = Expected outcome for player 2. Then:
# P2 = 1 / (1 + e^(-a))
# with a = g(sqrt(RD1^2 + RD2^2)) * (s2 - s1)
# and g(x) = 1 / sqrt(1 + 3*x^2 / pi^2)

## source: https://www.reddit.com/r/chess/comments/i0pnv1/comment/fzrhhwi


def g(x):
return 1 / np.sqrt(1 + 3 * x**2 / np.pi**2)


def get_player_expected_score(
player_rating, opponent_rating, player_rd=80.0, opponent_rd=80.0
):
"""Returns expected score of player based on player rating, opponent rating, and RDs (if known)."""
A = g(np.sqrt(player_rd**2 + opponent_rd**2)) * (player_rating - opponent_rating)
return 1 / (1 + np.exp(-A))


all_player_games_filtered_df["expected_scores"] = get_player_expected_score(
player_rating=all_player_games_filtered_df["ratings"].to_numpy(),
opponent_rating=all_player_games_filtered_df["opponent_ratings"].to_numpy(),
)

## how much better did a player perform than expected?
all_player_games_filtered_df["performance_difference"] = (
all_player_games_filtered_df["actual_scores"]
- all_player_games_filtered_df["expected_scores"]
)

## AGGREGATE GAME RESULTS FEATURES by player + time control
all_player_features = all_player_games_filtered_df.groupby(
level=["player", "time_control"]
).agg(
number_of_games=("ratings", "count"),
mean_perf_diff=(
"performance_difference",
"mean",
), # this takes into account opponent strength
std_perf_diff=(
"performance_difference",
"std",
), # this takes into account opponent strength
mean_rating=("ratings", "mean"),
median_rating=("ratings", "median"),
std_rating=("ratings", "std"),
mean_opponent_rating=("opponent_ratings", "mean"),
# median_opponent_rating=('opponent_ratings', 'median'),
# probably not needed, it's unlikely that opponent ratings will be skewed or provisional
std_opponent_rating=("opponent_ratings", "std"),
mean_rating_gain=("rating_gains", "mean"),
std_rating_gain=("rating_gains", "std"),
proportion_increment_games=("increments", "mean"),
)

## some useful red flags for suspicious behavior:
# (1) consistently performing above expectation
# (i.e. mean performance difference far from 0.00 with low standard deviation performance difference)
# we may refine this to drop the low standard deviation performance difference condition
# (2) high correlation between increment and expectation -- not yet implemented
# players who perform much better when playing increment are potentially suspicious
# but there are players who are not that fast with a mouse
# (3) high proportion of losses on time -- not yet implemented
# not conclusive by itself, but certainly supporting evidence
# most players don't want to lose!
# (4) analysis of move times -- not yet implemented (unknown if such data is available)

min_rating, max_rating = (
all_player_features["mean_rating"].min(),
all_player_features["mean_rating"].max(),
)
min_bin_rating = np.floor(all_player_features["mean_rating"].min() / 100.0) * 100
max_bin_rating = 100 + np.ceil(all_player_features["mean_rating"].max() / 100.0) * 100
rating_bins = np.arange(min_bin_rating, max_bin_rating, 100)

## assign the lower bound of each 100-point rating bin to each mean_rating
rating_bin_labels = [f"{str(int(x))} - {str(int(x)+100)}" for x in rating_bins[:-1]]
all_player_features["rating_bin"] = pd.cut(
all_player_features["mean_rating"], rating_bins, right=True, labels=rating_bins[:-1]
).astype(int)

## save to csv
all_player_features.to_csv(f"lichess_player_data/{BASE_FILE_NAME}_player_features.csv")
from enums import Folders


def make_player_features(CSV_RAW_FEATURES_FILE_PATH):
"""Creates features at the player + time control level from the CSV file containing raw features."""

all_player_games_df = pd.read_csv(
CSV_RAW_FEATURES_FILE_PATH,
index_col=[0, 1],
)
all_player_games_df.index = all_player_games_df.index.set_names(
["player", "time_control"]
)

## filter out users who have not played enough games
MIN_GAMES = 30
all_player_games_filtered_df = all_player_games_df.loc[
all_player_games_df.groupby(level=["player", "time_control"]).size()
>= MIN_GAMES
].copy()

## calculate how much someone exceeds expectations: (actual win rate - expected win rate)
## someone who has a high win rate could just play mostly lower rated opposition

## this is more involved and requires figuring out expected scores for each game
## e.g. if player 1 is 1500 and player 2 is also 1500, player 1 should have an expected score of 0.5
## but the exact nature of the curve depends on the glicko-2 rating system

## we will use the following paper: http://www.glicko.net/glicko/glicko2.pdf
## and this comment left by @chess_in_sgv:

# Let P2 = Expected outcome for player 2. Then:
# P2 = 1 / (1 + e^(-a))
# with a = g(sqrt(RD1^2 + RD2^2)) * (s2 - s1)
# and g(x) = 1 / sqrt(1 + 3*x^2 / pi^2)

## source: https://www.reddit.com/r/chess/comments/i0pnv1/comment/fzrhhwi

def g(x):
return 1 / np.sqrt(1 + 3 * x**2 / np.pi**2)

def get_player_expected_score(
player_rating, opponent_rating, player_rd=80.0, opponent_rd=80.0
):
"""Returns expected score of player based on player rating, opponent rating, and RDs (if known)."""
A = g(np.sqrt(player_rd**2 + opponent_rd**2)) * (
player_rating - opponent_rating
)
return 1 / (1 + np.exp(-A))

all_player_games_filtered_df["expected_scores"] = get_player_expected_score(
player_rating=all_player_games_filtered_df["ratings"].to_numpy(),
opponent_rating=all_player_games_filtered_df["opponent_ratings"].to_numpy(),
)

## how much better did a player perform than expected?
all_player_games_filtered_df["performance_difference"] = (
all_player_games_filtered_df["actual_scores"]
- all_player_games_filtered_df["expected_scores"]
)

## AGGREGATE GAME RESULTS FEATURES by player + time control
all_player_features = all_player_games_filtered_df.groupby(
level=["player", "time_control"]
).agg(
number_of_games=("ratings", "count"),
mean_perf_diff=(
"performance_difference",
"mean",
), # this takes into account opponent strength
std_perf_diff=(
"performance_difference",
"std",
), # this takes into account opponent strength
mean_rating=("ratings", "mean"),
median_rating=("ratings", "median"),
std_rating=("ratings", "std"),
mean_opponent_rating=("opponent_ratings", "mean"),
# median_opponent_rating=('opponent_ratings', 'median'),
# probably not needed, it's unlikely that opponent ratings will be skewed or provisional
std_opponent_rating=("opponent_ratings", "std"),
mean_rating_gain=("rating_gains", "mean"),
std_rating_gain=("rating_gains", "std"),
proportion_increment_games=("increments", "mean"),
)

## some useful red flags for suspicious behavior:
# (1) consistently performing above expectation
# (i.e. mean performance difference far from 0.00 with low standard deviation performance difference)
# we may refine this to drop the low standard deviation performance difference condition
# (2) high correlation between increment and expectation -- not yet implemented
# players who perform much better when playing increment are potentially suspicious
# but there are players who are not that fast with a mouse
# (3) high proportion of losses on time -- not yet implemented
# not conclusive by itself, but certainly supporting evidence
# most players don't want to lose!
# (4) analysis of move times -- not yet implemented (unknown if such data is available)

min_rating, max_rating = (
all_player_features["mean_rating"].min(),
all_player_features["mean_rating"].max(),
)
min_bin_rating = np.floor(all_player_features["mean_rating"].min() / 100.0) * 100
max_bin_rating = (
100 + np.ceil(all_player_features["mean_rating"].max() / 100.0) * 100
)
rating_bins = np.arange(min_bin_rating, max_bin_rating, 100)

## assign the lower bound of each 100-point rating bin to each mean_rating
rating_bin_labels = [f"{str(int(x))} - {str(int(x)+100)}" for x in rating_bins[:-1]]
all_player_features["rating_bin"] = pd.cut(
all_player_features["mean_rating"],
rating_bins,
right=True,
labels=rating_bins[:-1],
).astype(int)

## save to csv
BASE_FILE_NAME = Path(CSV_RAW_FEATURES_FILE_PATH).stem.split(".")[0]
all_player_features.to_csv(
f"{Folders.LICHESS_PLAYER_DATA.value}/{BASE_FILE_NAME}_player_features.csv"
)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Create player features from CSV file")
parser.add_argument(
"CSV_RAW_FEATURES_FILE_PATH", type=str, help="Path to the CSV file"
)
args = parser.parse_args()

## create features from the CSV file
make_player_features(args.CSV_RAW_FEATURES_FILE_PATH)
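The 100-point rating binning in `make_player_features` can be sketched without pandas (a hedged equivalent for illustration; behavior at exact bin edges differs slightly from `pd.cut(..., right=True)`, which places boundary values in the bin below):

```python
import math

def rating_bin(mean_rating, width=100):
    """Map a mean rating to the lower bound of its rating bin, plus a label."""
    lower = int(math.floor(mean_rating / width) * width)
    return lower, f"{lower} - {lower + width}"
```

For example, a mean rating of 1534.2 falls in the 1500 bin, matching the integer `rating_bin` values the script writes to the features CSV.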