call make_exploratory_plots from download_and_preprocess.py

merillium · Feb 24, 2024 · 8a795dd · 8a795dd
1 parent 12e5db0
commit 8a795dd
Show file tree

Hide file tree

Showing 4 changed files with 148 additions and 116 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # filter_suspicious_players
 
-This is a work-in-progress package that retrieves training data from the [lichess.org open database](https://database.lichess.org/), then trains a statistical model to detect suspicious players. Currently the app is not functional, and has not yet been built.
+This is a work-in-progress package that retrieves training data from the [lichess.org open database](https://database.lichess.org/), then trains a statistical model to detect suspicious players. Currently the app is still in development.
 
 ### Data Download and Preprocessing
 To download and preprocess data from the lichess.org open database, you can run the following command (specifying the year and month of the data you want to download, and the source of the data to be `lichess-open-database`):
@@ -9,7 +9,7 @@ To download and preprocess data from the lichess.org open database, you can run
 python3 download_and_preprocess.py --year 2015 --month 1 --source lichess-open-database
 ```
 
-The `download_and_preprocess.py` script downloads the `.pgn.zst` file corresponding to the month and year specified, decompresses the `.pgn` file, and creates the `lichess_downloaded_games` directory to which both files are saved. Then the script preprocesses the `.pgn` file and extracts relevant features, creates the `lichess_player_data` directory, to which a `.csv` file is saved. By default, all raw files in the `lichess_downloaded_games` directory are then deleted because they are typically large and not needed after preprocessing. (This process can be streamlined by directly reading from the decompressed `.pgn` file instead of first saving it)
+The `download_and_preprocess.py` script downloads the `.pgn.zst` file corresponding to the month and year specified, decompresses the `.pgn` file, and creates the `lichess_downloaded_games` directory to which both files are saved. The script then preprocesses the `.pgn` file, extracts relevant features, and creates the `lichess_player_data` directory, to which a `.csv` file is saved. By default, exploratory plots are generated, and then all raw files in the `lichess_downloaded_games` directory are deleted because they are typically large and not needed after preprocessing. (This process can be streamlined by directly reading from the decompressed `.pgn` file instead of first saving it.)
 
 ### Model Description
 This is a simple statistical model that flags players who have performed a certain threshold above their expected performance under the Glicko-2 rating system. The expected performance takes into account each player's complete game history and opponents in the span of the training data. The thresholds are initialized to default values, and then adjusted separately for each 100 point rating bin in the training data.

diff --git a/download_and_preprocess.py b/download_and_preprocess.py
@@ -73,8 +73,17 @@ def preprocess_data(filename, remove_raw_files):
     )
     subprocess.run(["python3", "make_player_features.py", CSV_RAW_FEATURES_FILE_PATH])
 
+    ## make exploratory plots
+    CSV_PLAYER_FEATURE_FILE_PATH = (
+        f"{Folders.LICHESS_PLAYER_DATA.value}/{BASE_FILE_NAME}_player_features.csv"
+    )
+    subprocess.run(
+        ["python3", "make_exploratory_plots.py", CSV_PLAYER_FEATURE_FILE_PATH]
+    )
+
     # Remove the downloaded .pgn.zst and .pgn files
     if remove_raw_files:
+        print("Cleaning up downloaded files...")
         os.remove(PGN_FILE_PATH)
         os.remove(ZST_FILE_PATH)
 
@@ -90,6 +99,11 @@ def main():
         choices=["lichess-open-database"],
         required=True,
     )
+    parser.add_argument(
+        "--generate-exploratory-plots",
+        action="store_true",
+        help="Generate exploratory plots",
+    )
     parser.add_argument(
         "--remove-raw-files",
         action="store_true",

diff --git a/exploratory_plots.py b/exploratory_plots.py
diff --git a/make_exploratory_plots.py b/make_exploratory_plots.py
@@ -0,0 +1,132 @@
+import argparse
+import os
+from pathlib import Path
+import pandas as pd
+import plotly.graph_objects as go
+from enums import Folders
+
+
+def make_exploratory_plots(CSV_PLAYER_FEATURE_FILE_PATH):
+    """Write per-time-control violin plots of the player features as HTML files.
+
+    Reads the player features CSV, keeps only the rows whose ``time_control``
+    is "bullet", "blitz" or "classical", and for each remaining time control
+    writes two plotly figures into ``Folders.EXPLORATORY_PLOTS.value``:
+
+    * ``{BASE_FILE_NAME}_{time_control}_rating_gain.html`` shows the
+      distribution of ``mean_rating_gain`` for each rating bin.
+    * ``{BASE_FILE_NAME}_{time_control}_perf_diff.html`` shows the
+      distribution of ``mean_perf_diff`` for each rating bin.
+
+    Each figure has one horizontal, one-sided violin per ``rating_bin``, a
+    black diamond marker at that bin's mean, and a dashed vertical line at
+    x=0.
+
+    Args:
+        CSV_PLAYER_FEATURE_FILE_PATH: Path to the player features CSV file.
+            The code reads the columns ``time_control``, ``rating_bin``,
+            ``mean_rating_gain`` and ``mean_perf_diff`` from it.
+    """
+    ## create the output folder the first time plots are generated
+    ## NOTE(review): os.mkdir only creates the last path component,
+    ## so the parent of the plots folder must already exist
+    if not os.path.exists(Folders.EXPLORATORY_PLOTS.value):
+        os.mkdir(Folders.EXPLORATORY_PLOTS.value)
+
+    ## file name without its extension, cut at the first "."; used as the
+    ## prefix of every output file name
+    BASE_FILE_NAME = Path(CSV_PLAYER_FEATURE_FILE_PATH).stem.split(".")[0]
+
+    ## load the player features dataframe
+    ## NOTE(review): rows with any other time_control value are dropped here;
+    ## confirm that this is intended
+    all_player_features = pd.read_csv(CSV_PLAYER_FEATURE_FILE_PATH)
+    all_player_features = all_player_features[
+        all_player_features["time_control"].isin(["bullet", "blitz", "classical"])
+    ]
+
+    ## plot the distribution of mean rating gain for each rating bin
+    for time_group, time_group_df in all_player_features.groupby("time_control"):
+        fig = go.Figure()
+        for rating_bin, rating_group in time_group_df.groupby("rating_bin"):
+            ## label of the form "low-high"; presumably rating_bin is the
+            ## lower bound of a 100 point bin (verify against the feature script)
+            rating_bin_str = f"{rating_bin}-{rating_bin+100}"
+            fig.add_trace(
+                go.Violin(
+                    x=rating_group["mean_rating_gain"].values,
+                    name=rating_bin_str,
+                    box_visible=False,
+                    meanline_visible=False,
+                    opacity=0.5,
+                ),
+            )
+
+        ## the side='positive' argument to update_traces method
+        ## is only valid for a figure containing only go.Violin plots,
+        ## so we have to call update_traces before we add any of the go.Scatter traces
+
+        fig.update_traces(orientation="h", side="positive", width=3, points=False)
+        fig.update_layout(
+            title=f"{time_group.capitalize()} Rating Changes by Rating Bin",
+            xaxis_title="Mean Rating Change",
+            yaxis_title="Rating Bin",
+            xaxis_showgrid=False,
+            xaxis_zeroline=False,
+        )
+
+        ## add markers to indicate the mean rating gain for each rating bin
+        for rating_bin, rating_group in time_group_df.groupby("rating_bin"):
+            ## same label as the violin's name, so the marker is placed on
+            ## the matching y category
+            rating_bin_str = f"{rating_bin}-{rating_bin+100}"
+            fig.add_trace(
+                go.Scatter(
+                    x=[rating_group["mean_rating_gain"].mean()],
+                    y=[rating_bin_str],
+                    mode="markers",
+                    showlegend=False,
+                    marker={"color": "black", "size": 5},
+                    marker_symbol="diamond",
+                )
+            )
+
+        ## dashed reference line at a mean rating gain of zero
+        fig.add_vline(
+            x=0, line_dash="dash", line_color="blue", line_width=2, opacity=0.5
+        )
+        fig.write_html(
+            f"{Folders.EXPLORATORY_PLOTS.value}/{BASE_FILE_NAME}_{time_group}_rating_gain.html"
+        )
+
+    ## plot distribution of mean_perf_diff
+    ## (same structure as the rating gain plots above, with a fixed x-axis range)
+    for time_group, time_group_df in all_player_features.groupby("time_control"):
+        fig = go.Figure()
+        for rating_bin, rating_group in time_group_df.groupby("rating_bin"):
+            rating_bin_str = f"{rating_bin}-{rating_bin+100}"
+            fig.add_trace(
+                go.Violin(
+                    x=rating_group["mean_perf_diff"].values,
+                    name=rating_bin_str,
+                    box_visible=False,
+                    meanline_visible=False,
+                    opacity=0.5,
+                ),
+            )
+
+        ## the side='positive' argument to update_traces method
+        ## is only valid for a figure containing only go.Violin plots,
+        ## so we have to call update_traces before we add any of the go.Scatter traces
+
+        fig.update_traces(orientation="h", side="positive", width=3, points=False)
+        fig.update_layout(
+            title=f"{time_group.capitalize()} Performance Difference by Rating Bin",
+            xaxis_title="Mean Performance Difference",
+            yaxis_title="Rating Bin",
+            xaxis_range=[-1.00, 1.00],
+            xaxis_showgrid=False,
+            xaxis_zeroline=False,
+        )
+
+        ## add markers to indicate the mean performance difference for each rating bin
+        for rating_bin, rating_group in time_group_df.groupby("rating_bin"):
+            rating_bin_str = f"{rating_bin}-{rating_bin+100}"
+            fig.add_trace(
+                go.Scatter(
+                    x=[rating_group["mean_perf_diff"].mean()],
+                    y=[rating_bin_str],
+                    mode="markers",
+                    showlegend=False,
+                    marker={"color": "black", "size": 5},
+                    marker_symbol="diamond",
+                )
+            )
+
+        ## dashed reference line at a mean performance difference of zero
+        fig.add_vline(
+            x=0.0, line_dash="dash", line_color="blue", line_width=2, opacity=0.5
+        )
+
+        fig.write_html(
+            f"{Folders.EXPLORATORY_PLOTS.value}/{BASE_FILE_NAME}_{time_group}_perf_diff.html"
+        )
+
+
+## command line entry point: takes the path of the player features CSV file
+## as its only (positional) argument
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Create exploratory plots")
+    parser.add_argument(
+        "CSV_PLAYER_FEATURE_FILE_PATH",
+        type=str,
+        help="Path to the player features CSV file",
+    )
+    args = parser.parse_args()
+
+    ## create the exploratory plots from the player features CSV file
+    make_exploratory_plots(args.CSV_PLAYER_FEATURE_FILE_PATH)