diff --git a/.gitignore b/.gitignore index 5655d93..10f023b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ **/__pycache__/ !/.gitignore lichess_player_data/ -lichess-games-database/ +lichess_downloaded_games/ exploratory_plots/ model_plots/ saved_models/ \ No newline at end of file diff --git a/README.md b/README.md index 559e9b8..294abfa 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,15 @@ # filter_suspicious_players -This is a work-in-progress package that retrieves training data from the [lichess.org open database](https://database.lichess.org/), then trains a statistical model to detect suspicious players. +This is a work-in-progress package that retrieves training data from the [lichess.org open database](https://database.lichess.org/), then trains a statistical model to detect suspicious players. Currently the app is not functional, and has not yet been built. -Currently the app is not functional, and has not been deployed. If cloning this repo for personal use, the structure of the python scripts assumes that there is a folder called `lichess-games-database` to which .pgn and .pgn.zst files are downloaded and unzipped (this may be automated in the future using a bash script), and that there is a folder called `lichess_player_data` to which .csv files are saved (this folder is created by `parse_pgn.py` if it doesn't exist). +### Data Download and Preprocessing +To download and preprocess data from the lichess.org open database, you can run the following command: + +```bash +python3 download_and_preprocess.py --year 2015 --month 1 --filetype lichess-open-database +``` + +The `download_and_preprocess.py` script downloads the `.pgn.zst` file corresponding to the month and year specified, decompresses it into a `.pgn` file, and creates the `lichess_downloaded_games` directory to which both files are saved. 
Then the script preprocesses the `.pgn` file, extracts relevant features, and creates the `lichess_player_data` directory, to which a `.csv` file is saved. If the `--remove-raw-files` flag is passed, the raw `.pgn.zst` and `.pgn` files in the `lichess_downloaded_games` directory are then deleted, because they are typically large and not needed after preprocessing. (This process could be streamlined by reading directly from the decompressed stream instead of first saving the `.pgn` file.) ### Model Description This is a simple statistical model that flags players who have performed a certain threshold above their expected performance under the Glicko-2 rating system. The expected performance takes into account each player's complete game history and opponents in the span of the training data. The thresholds are initialized to default values, and then adjusted separately for each 100 point rating bin in the training data. @@ -37,7 +44,9 @@ Currently working on unit tests, which can be run with the following command: ```make test```, or if you want to run test files individually ```PYTHONPATH=. 
pytest tests/test_model.py``` To-do: -- write a bash script to download and unzip data from the lichess.org open database +- restructure `make_player_features.py` to parse arguments from `download_and_preprocess_data.py` - complete data labelling using lichess API calls, with a workaround or retry request if API rate limiting occurs - write unit tests for scripts that perform feature extraction and data labelling -- write unit tests for `PlayerAnomalyDetectionModel` class and methods (in-progress) \ No newline at end of file +- write unit tests for `PlayerAnomalyDetectionModel` class and methods (in-progress) +- possible benchmarks for length of time to execute data downloading, preprocessing, and model training depending +on the size of the raw data \ No newline at end of file diff --git a/download_and_preprocess.py b/download_and_preprocess.py new file mode 100644 index 0000000..88b3396 --- /dev/null +++ b/download_and_preprocess.py @@ -0,0 +1,112 @@ +import argparse +import os +import re +from pathlib import Path +import subprocess +import zstandard as zstd +import pyzstd + +from enums import Folders + + +def download_data(year, month, filetype): + month = str(month).zfill(2) + url = f"https://database.lichess.org/standard/lichess_db_standard_rated_{year}-{month}.pgn.zst" + filename = f"lichess_db_standard_rated_{year}-{month}.pgn.zst" + if not os.path.exists(Folders.LICHESS_DOWNLOADED_GAMES.value): + os.mkdir(Folders.LICHESS_DOWNLOADED_GAMES.value) + + # Check file size before downloading + response = subprocess.run( + ["wget", "--spider", "--server-response", url], + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + # print(f"response: {response}") + result = re.search("Content-Length: (.*)\n", response.stderr.decode()) + content_length = result.group(1) if result else None + download_file_size = int(content_length) if content_length.isdigit() else None + + # Warn user if file size exceeds 1 GB + if download_file_size and download_file_size > 10**9: + 
user_response = input( + "Warning: File size exceeds 1GB. Do you want to proceed with the download? (Y/N): " + ) + if user_response.lower() != "y": + print("Download aborted.") + return None + else: + subprocess.run(["wget", url, "-P", Folders.LICHESS_DOWNLOADED_GAMES.value]) + else: + subprocess.run(["wget", url, "-P", Folders.LICHESS_DOWNLOADED_GAMES.value]) + + return filename + + +def preprocess_data(filename, remove_raw_files): + """This function calls parse_pgn.py and make_player_features.py with the filename argument""" + + base_filename = Path(filename).stem.split(".")[ + 0 + ] ## removes .pgn.zst from extension + PGN_FILE_PATH = f"{Folders.LICHESS_DOWNLOADED_GAMES.value}/{base_filename}.pgn" + ZST_FILE_PATH = f"{Folders.LICHESS_DOWNLOADED_GAMES.value}/{base_filename}.pgn.zst" + + # decompress .pgn.zst and save as .pgn + # note: for large files, this operation must be done in chunks + # with open(ZST_FILE_PATH, "rb") as compressed_file: + # dctx = zstd.ZstdDecompressor() + # with open(PGN_FILE_PATH, "wb") as output_file: + # for chunk in dctx.read_to_iter(compressed_file): + # output_file.write(chunk) + + # with open(ZST_FILE_PATH, 'rb') as compressed_file: + # decompressor = zstd.ZstdDecompressor() + # with decompressor.stream_reader(compressed_file) as reader: + # with open(PGN_FILE_PATH, 'wb') as output_file: + # for chunk in reader: + # output_file.write(chunk) + + with open(ZST_FILE_PATH, "rb") as f_in: + compressed_data = f_in.read() + + decompressed_data = pyzstd.decompress(compressed_data) + + with open(PGN_FILE_PATH, "wb") as f_out: + f_out.write(decompressed_data) + + subprocess.run(["python3", "parse_pgn.py", PGN_FILE_PATH]) + + CSV_FILE_PATH = f"{Folders.LICHESS_PLAYER_DATA.value}/{base_filename}.csv" + # subprocess.run(["python3", "make_player_features.py", filename[:-4]]) + + # Remove the downloaded .pgn.zst and .pgn files + if remove_raw_files: + os.remove(PGN_FILE_PATH) + os.remove(ZST_FILE_PATH) + + +def main(): + parser = 
argparse.ArgumentParser(description="Download and preprocess Lichess data") + parser.add_argument("--year", type=int, help="Year of the data", required=True) + parser.add_argument("--month", type=int, help="Month of the data", required=True) + parser.add_argument( + "--filetype", + type=str, + help="Type of file to download", + choices=["lichess-open-database"], + required=True, + ) + parser.add_argument( + "--remove-raw-files", + action="store_true", + help="Remove raw files after preprocessing", + ) + args = parser.parse_args() + + filename = download_data(args.year, args.month, args.filetype) + preprocess_data(filename, args.remove_raw_files) + + +if __name__ == "__main__": + main() diff --git a/enums.py b/enums.py index b537f90..72f964e 100644 --- a/enums.py +++ b/enums.py @@ -15,6 +15,8 @@ class TimeControl(Enum): class Folders(Enum): """Enum to represent the default folder name(s) in the project.""" + LICHESS_DOWNLOADED_GAMES = "lichess_downloaded_games" + LICHESS_PLAYER_DATA = "lichess_player_data" MODEL_PLOTS = "model_plots" SAVED_MODELS = "saved_models" EXPLORATORY_PLOTS = "exploratory_plots" diff --git a/exploratory_plots.py b/exploratory_plots.py index 0e8c406..34bdae8 100644 --- a/exploratory_plots.py +++ b/exploratory_plots.py @@ -6,7 +6,6 @@ ## constants could eventually go into enums BASE_FILE_NAME = "lichess_db_standard_rated_2015-01" - if not os.path.exists(Folders.EXPLORATORY_PLOTS.value): os.mkdir(Folders.EXPLORATORY_PLOTS.value) diff --git a/make_player_features.py b/make_player_features.py index a140a7e..d9f9b15 100644 --- a/make_player_features.py +++ b/make_player_features.py @@ -43,9 +43,7 @@ def get_player_expected_score( player_rating, opponent_rating, player_rd=80.0, opponent_rd=80.0 ): """Returns expected score of player based on player rating, opponent rating, and RDs (if known).""" - A = g(np.sqrt(player_rd**2 + opponent_rd**2)) * ( - player_rating - opponent_rating - ) + A = g(np.sqrt(player_rd**2 + opponent_rd**2)) * (player_rating - 
opponent_rating) return 1 / (1 + np.exp(-A)) diff --git a/parse_pgn.py b/parse_pgn.py index b1eb49b..972da86 100644 --- a/parse_pgn.py +++ b/parse_pgn.py @@ -1,16 +1,11 @@ +import argparse import os import pandas as pd import chess.pgn -from enums import TimeControl +import zstandard as zstd +from enums import TimeControl, Folders +from pathlib import Path -BASE_FILE_NAME = "lichess_db_standard_rated_2015-01" -PGN_FILE_PATH = f"lichess-games-database/{BASE_FILE_NAME}.pgn" -LICHESS_PLAYER_DATA_FOLDER = "lichess_player_data" - -if not os.path.exists(LICHESS_PLAYER_DATA_FOLDER): - os.mkdir(LICHESS_PLAYER_DATA_FOLDER) - -pgn = open(PGN_FILE_PATH) all_player_info = {} # dictionary storing player info in the following format: @@ -73,115 +68,136 @@ def update_all_player_info( all_player_info[(player, time_control)]["increments"].append(is_increment) -number_of_games_parsed = 0 -while True: - game = chess.pgn.read_game(pgn) - if game is None: - print(f"{number_of_games_parsed} [valid] games parsed.") - break - - headers = game.headers - - # get time control - event = headers["Event"] - if TimeControl.BULLET.value in event.lower(): - time_control = TimeControl.BULLET.value - elif TimeControl.BLITZ.value in event.lower(): - time_control = TimeControl.BLITZ.value - elif TimeControl.RAPID.value in event.lower(): - time_control = TimeControl.RAPID.value - elif TimeControl.CLASSICAL.value in event.lower(): - time_control = TimeControl.CLASSICAL.value - else: - time_control = TimeControl.OTHER.value +def parse_pgn(PGN_FILE_PATH): + print(f"Parsing {PGN_FILE_PATH}...") + + if not os.path.exists(Folders.LICHESS_PLAYER_DATA.value): + os.mkdir(Folders.LICHESS_PLAYER_DATA.value) + + pgn = open(PGN_FILE_PATH) + + # parse the pgn file, and extract information from each game + number_of_games_parsed = 0 + while True: + game = chess.pgn.read_game(pgn) + if game is None: + print(f"{number_of_games_parsed} [valid] games parsed.") + break + + headers = game.headers + + # get time control + 
event = headers["Event"] + if TimeControl.BULLET.value in event.lower(): + time_control = TimeControl.BULLET.value + elif TimeControl.BLITZ.value in event.lower(): + time_control = TimeControl.BLITZ.value + elif TimeControl.RAPID.value in event.lower(): + time_control = TimeControl.RAPID.value + elif TimeControl.CLASSICAL.value in event.lower(): + time_control = TimeControl.CLASSICAL.value + else: + time_control = TimeControl.OTHER.value + + # get info for both players + white_player, black_player = headers.get("White"), headers.get("Black") + white_rating, black_rating = headers.get("WhiteElo"), headers.get("BlackElo") + white_gain, black_gain = headers.get("WhiteRatingDiff"), headers.get( + "BlackRatingDiff" + ) + increment = headers["TimeControl"][0] + result = headers["Result"] + + # skip games with unknown players, ratings, rating difference, or result + # if either opponent has not played rated games, their rating is 1500 + # but a rating difference is not calculated because this rating is misleading + # therefore, we will exclude such games + skip_game_condition = ( + ("?" in white_player) + | ("?" in black_player) + | (white_player is None) + | (black_player is None) + | ("?" in str(white_rating)) + | ("?" 
in str(black_rating)) + | (white_gain is None) + | (black_gain is None) + | (result not in ["1-0", "0-1", "1/2-1/2"]) + ) + if skip_game_condition: + continue + else: + white_score = 1 if result == "1-0" else 0.5 if result == "1/2-1/2" else 0 + black_score = 0 if result == "1-0" else 0.5 if result == "1/2-1/2" else 1 + + ## only convert rating and rating gain to a number once we know it's not None + white_rating = float(white_rating) + black_rating = float(black_rating) + white_gain = float(white_gain) + black_gain = float(black_gain) + + is_increment = 0 if increment == "0" else 1 + + # update white player info + update_all_player_info( + player=white_player, + time_control=time_control, + current_rating=white_rating, + opponent_rating=black_rating, + score=white_score, + rating_gain=white_gain, + is_increment=is_increment, + ) + + # update black player info + update_all_player_info( + player=black_player, + time_control=time_control, + current_rating=black_rating, + opponent_rating=white_rating, + score=black_score, + rating_gain=black_gain, + is_increment=is_increment, + ) + + number_of_games_parsed += 1 + if number_of_games_parsed % 10000 == 0: + print(f"{number_of_games_parsed} games parsed...") + + # convert to pandas DataFrame + all_player_df = pd.DataFrame.from_dict( + all_player_info, + orient="index", + columns=[ + "ratings", + "opponent_ratings", + "actual_scores", + "rating_gains", + "increments", + ], + ) - # get info for both players - white_player, black_player = headers.get("White"), headers.get("Black") - white_rating, black_rating = headers.get("WhiteElo"), headers.get("BlackElo") - white_gain, black_gain = headers.get("WhiteRatingDiff"), headers.get( - "BlackRatingDiff" + # explode all_player_df to each row corresponds to one game + all_player_games_exploded = all_player_df.explode( + column=[ + "ratings", + "opponent_ratings", + "actual_scores", + "rating_gains", + "increments", + ] ) - increment = headers["TimeControl"][0] - result = 
headers["Result"] - - # skip games with unknown players, ratings, rating difference, or result - # if either opponent has not played rated games, their rating is 1500 - # but a rating difference is not calculated because this rating is misleading - # therefore, we will exclude such games - skip_game_condition = ( - ("?" in white_player) - | ("?" in black_player) - | (white_player is None) - | (black_player is None) - | ("?" in str(white_rating)) - | ("?" in str(black_rating)) - | (white_gain is None) - | (black_gain is None) - | (result not in ["1-0", "0-1", "1/2-1/2"]) + + # save to csv + base_filename = Path(PGN_FILE_PATH).stem.split(".")[0] + all_player_games_exploded.to_csv( + f"{Folders.LICHESS_PLAYER_DATA.value}/{base_filename}.csv" ) - if skip_game_condition: - continue - else: - white_score = 1 if result == "1-0" else 0.5 if result == "1/2-1/2" else 0 - black_score = 0 if result == "1-0" else 0.5 if result == "1/2-1/2" else 1 - - ## only convert rating and rating gain to a number once we know it's not None - white_rating = float(white_rating) - black_rating = float(black_rating) - white_gain = float(white_gain) - black_gain = float(black_gain) - - is_increment = 0 if increment == "0" else 1 - - # update white player info - update_all_player_info( - player=white_player, - time_control=time_control, - current_rating=white_rating, - opponent_rating=black_rating, - score=white_score, - rating_gain=white_gain, - is_increment=is_increment, - ) - # update black player info - update_all_player_info( - player=black_player, - time_control=time_control, - current_rating=black_rating, - opponent_rating=white_rating, - score=black_score, - rating_gain=black_gain, - is_increment=is_increment, - ) - number_of_games_parsed += 1 - if number_of_games_parsed % 10000 == 0: - print(f"{number_of_games_parsed} games parsed...") - -# convert to pandas DataFrame -all_player_df = pd.DataFrame.from_dict( - all_player_info, - orient="index", - columns=[ - "ratings", - 
"opponent_ratings", - "actual_scores", - "rating_gains", - "increments", - ], -) - -# explode all_player_df to each row corresponds to one game -all_player_games_exploded = all_player_df.explode( - column=[ - "ratings", - "opponent_ratings", - "actual_scores", - "rating_gains", - "increments", - ] -) - -# save to csv -all_player_games_exploded.to_csv(f"{LICHESS_PLAYER_DATA_FOLDER}/{BASE_FILE_NAME}.csv") +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Parse PGN file") + parser.add_argument("PGN_FILE_PATH", type=str, help="Path to the PGN file") + args = parser.parse_args() + + ## parse PGN file + parse_pgn(args.PGN_FILE_PATH) diff --git a/requirements.txt b/requirements.txt index ef99429..a9a9a8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,5 +8,6 @@ python-lichess==0.10 # Client for lichess.org API mock==5.1.0 pylint==3.0.3 pytest==7.0.1 +pyzstd==0.15.9 debugpy # Required for debugging.