
Commit

refactored PlayerAnomalyDetectionModel and PlayerAccountHandler classes, updated unit tests
merillium committed Feb 4, 2024
1 parent c6b6e0c commit 77058f8
Showing 15 changed files with 465 additions and 342 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,6 +1,6 @@
.*
**/__pycache__/
!/.gitignore
__pycache__
lichess_player_data/
lichess-games-database/
exploratory_plots/
3 changes: 3 additions & 0 deletions Makefile
@@ -0,0 +1,3 @@
.PHONY: test
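# note: PYTHONPATH=. puts the repo root on the import path so the tests can import the top-level modules (model, player_account_handler)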
test:
	PYTHONPATH=. pytest
10 changes: 7 additions & 3 deletions README.md
@@ -19,17 +19,21 @@ The model is built on the assumption that cheating is a rare occurrence in any d

### Sample code:
```python
import pandas as pd
from player_account_handler import PlayerAccountHandler
from model import PlayerAnomalyDetectionModel
BASE_FILE_NAME = 'lichess_db_standard_rated_2015-01'
train_data = pd.read_csv(f'lichess_player_data/{BASE_FILE_NAME}_player_features.csv')
model = PlayerAnomalyDetectionModel()
player_account_handler = PlayerAccountHandler()
model = PlayerAnomalyDetectionModel(player_account_handler)
model.fit(train_data)
model.save_model(f'{BASE_FILE_NAME}_model')
predictions = model.predict(train_data)
```
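
A possible follow-up to the sample above, sketching how the output of `predict` could be inspected. The return format and the `is_anomalous` column name are assumptions for illustration only; this commit does not show what `predict` actually returns.

```python
# Continuing from the sample above (hypothetical sketch).
# Assumption: predict() returns a pandas DataFrame aligned with train_data's rows,
# including a boolean column flagging suspected anomalous accounts; the real
# column name may differ.
flagged = predictions[predictions["is_anomalous"]]
print(f"Flagged {len(flagged)} of {len(train_data)} player/time-control rows")
```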

### Unit Tests
Currently working on the following unit test(s) which can be run with the following command:
```pytest test_model.py```
Currently working on unit tests, which can be run with the following command:
```make test```, or, to run a test file individually, ```PYTHONPATH=. pytest tests/test_model.py```
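
As a rough illustration, a minimal pytest sketch under the same project layout; the actual `tests/test_model.py` is not shown in this commit, and the feature columns below are assumptions borrowed from the exploratory plots.

```python
# Hypothetical sketch of tests/test_model.py -- not the committed test file.
import pandas as pd

from player_account_handler import PlayerAccountHandler
from model import PlayerAnomalyDetectionModel


def test_fit_and_predict_run_end_to_end():
    # Assumed feature columns; the real training CSVs may contain more or different ones.
    train_data = pd.DataFrame(
        {
            "time_control": ["blitz", "blitz"],
            "rating_bin": [1500, 1600],
            "mean_rating_gain": [2.0, 35.0],
            "mean_perf_diff": [0.05, 0.40],
        }
    )
    model = PlayerAnomalyDetectionModel(PlayerAccountHandler())
    model.fit(train_data)
    assert model.predict(train_data) is not None
```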

To-do:
- write a bash script to download and unzip data from the lichess.org open database
21 changes: 11 additions & 10 deletions app.py
@@ -1,6 +1,7 @@
"""
A sample Hello World server.
"""

import os

from flask import Flask, render_template
@@ -11,20 +12,20 @@
app = Flask(__name__)


@app.route('/')
@app.route("/")
def hello():
"""Return a friendly HTTP greeting."""
message = "It's running!"

"""Get Cloud Run environment variables."""
service = os.environ.get('K_SERVICE', 'Unknown service')
revision = os.environ.get('K_REVISION', 'Unknown revision')
service = os.environ.get("K_SERVICE", "Unknown service")
revision = os.environ.get("K_REVISION", "Unknown revision")

return render_template(
"index.html", message=message, Service=service, Revision=revision
)

return render_template('index.html',
message=message,
Service=service,
Revision=revision)

if __name__ == '__main__':
    server_port = os.environ.get('PORT', '8080')
    app.run(debug=False, port=server_port, host='0.0.0.0')
if __name__ == "__main__":
    server_port = os.environ.get("PORT", "8080")
    app.run(debug=False, port=server_port, host="0.0.0.0")
17 changes: 10 additions & 7 deletions enums.py
@@ -1,12 +1,15 @@
from enum import Enum


class TimeControl(Enum):
    BULLET = 'bullet'
    BLITZ = 'blitz'
    RAPID = 'rapid'
    CLASSICAL = 'classical'
    ALL = ['bullet', 'blitz', 'rapid', 'classical']
    BULLET = "bullet"
    BLITZ = "blitz"
    RAPID = "rapid"
    CLASSICAL = "classical"
    ALL = ["bullet", "blitz", "rapid", "classical"]


class Folders(Enum):
    MODEL_PLOTS = 'model_plots'
    SAVED_MODELS = 'saved_models'
    MODEL_PLOTS = "model_plots"
    SAVED_MODELS = "saved_models"
    EXPLORATORY_PLOTS = "exploratory_plots"
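
For context, a small sketch of how these `Folders` members are consumed (compare `exploratory_plots.py` below); the `os.makedirs(..., exist_ok=True)` variant and the example file name are illustrative, not the committed code.

```python
import os

from enums import Folders, TimeControl

# Create the plots directory if it does not already exist.
os.makedirs(Folders.EXPLORATORY_PLOTS.value, exist_ok=True)

# Enum values hold the raw strings used to build output paths.
out_path = os.path.join(
    Folders.EXPLORATORY_PLOTS.value,
    f"{TimeControl.BLITZ.value}_rating_gain.html",
)
```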
154 changes: 89 additions & 65 deletions exploratory_plots.py
@@ -4,91 +4,115 @@
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from enums import Folders

## constants could eventually go into enums
BASE_FILE_NAME = 'lichess_db_standard_rated_2015-01'
EXPLORATORY_PLOTS_FOLDER = 'exploratory_plots'
BASE_FILE_NAME = "lichess_db_standard_rated_2015-01"

if not os.path.exists(EXPLORATORY_PLOTS_FOLDER):
    os.mkdir(EXPLORATORY_PLOTS_FOLDER)

if not os.path.exists(Folders.EXPLORATORY_PLOTS.value):
    os.mkdir(Folders.EXPLORATORY_PLOTS.value)

## load the player features dataframe
all_player_features = pd.read_csv(f'lichess_player_data/{BASE_FILE_NAME}_player_features.csv')
all_player_features = all_player_features[all_player_features['time_control'].isin(['bullet','blitz','classical'])]
all_player_features = pd.read_csv(
    f"lichess_player_data/{BASE_FILE_NAME}_player_features.csv"
)
all_player_features = all_player_features[
    all_player_features["time_control"].isin(["bullet", "blitz", "classical"])
]

## plot the distribution of mean rating gain for each rating bin
for time_group, time_group_df in all_player_features.groupby('time_control'):
for time_group, time_group_df in all_player_features.groupby("time_control"):
    fig = go.Figure()
    for rating_bin, rating_group in time_group_df.groupby('rating_bin'):
    for rating_bin, rating_group in time_group_df.groupby("rating_bin"):
        rating_bin_str = f"{rating_bin}-{rating_bin+100}"
        fig.add_trace(go.Violin(x=rating_group['mean_rating_gain'].values,
                                name=rating_bin_str,
                                box_visible=False,
                                meanline_visible=False,
                                opacity=0.5),
        )

    ## the side='positive' argument to update_traces method
    ## is only valid for a figure containing only go.Violin plots,
        fig.add_trace(
            go.Violin(
                x=rating_group["mean_rating_gain"].values,
                name=rating_bin_str,
                box_visible=False,
                meanline_visible=False,
                opacity=0.5,
            ),
        )

    ## the side='positive' argument to update_traces method
    ## is only valid for a figure containing only go.Violin plots,
    ## so we have to update_layout before we add any of the go.Scatter traces

    fig.update_traces(orientation='h', side='positive', width=3, points=False)
    fig.update_layout(title=f'{time_group.capitalize()} Rating Changes by Rating Bin',
                      xaxis_title='Mean Rating Change',
                      yaxis_title='Rating Bin',
                      xaxis_showgrid=False, xaxis_zeroline=False)

    fig.update_traces(orientation="h", side="positive", width=3, points=False)
    fig.update_layout(
        title=f"{time_group.capitalize()} Rating Changes by Rating Bin",
        xaxis_title="Mean Rating Change",
        yaxis_title="Rating Bin",
        xaxis_showgrid=False,
        xaxis_zeroline=False,
    )

    ## add markers to indicate the mean rating gain for each rating bin
    for rating_bin, rating_group in time_group_df.groupby('rating_bin'):
    for rating_bin, rating_group in time_group_df.groupby("rating_bin"):
        rating_bin_str = f"{rating_bin}-{rating_bin+100}"
        fig.add_trace(go.Scatter(
            x=[rating_group['mean_rating_gain'].mean()],
            y=[rating_bin_str],
            mode='markers',
            showlegend=False,
            marker=dict(color='black', size=5),
            marker_symbol='diamond'
        ))

    fig.add_vline(x=0, line_dash="dash", line_color='blue', line_width=2, opacity=0.5)
    fig.write_html(f"exploratory_plots/{BASE_FILE_NAME}_{time_group}_rating_gain.html")
        fig.add_trace(
            go.Scatter(
                x=[rating_group["mean_rating_gain"].mean()],
                y=[rating_bin_str],
                mode="markers",
                showlegend=False,
                marker=dict(color="black", size=5),
                marker_symbol="diamond",
            )
        )

    fig.add_vline(x=0, line_dash="dash", line_color="blue", line_width=2, opacity=0.5)
    fig.write_html(
        f"{Folders.EXPLORATORY_PLOTS.value}/{BASE_FILE_NAME}_{time_group}_rating_gain.html"
    )


## plot distribution of mean_perf_diff
for time_group, time_group_df in all_player_features.groupby('time_control'):
for time_group, time_group_df in all_player_features.groupby("time_control"):
    fig = go.Figure()
    for rating_bin, rating_group in time_group_df.groupby('rating_bin'):
    for rating_bin, rating_group in time_group_df.groupby("rating_bin"):
        rating_bin_str = f"{rating_bin}-{rating_bin+100}"
        fig.add_trace(go.Violin(x=rating_group['mean_perf_diff'].values,
                                name=rating_bin_str,
                                box_visible=False,
                                meanline_visible=False,
                                opacity=0.5),
        )

    ## the side='positive' argument to update_traces method
    ## is only valid for a figure containing only go.Violin plots,
        fig.add_trace(
            go.Violin(
                x=rating_group["mean_perf_diff"].values,
                name=rating_bin_str,
                box_visible=False,
                meanline_visible=False,
                opacity=0.5,
            ),
        )

    ## the side='positive' argument to update_traces method
    ## is only valid for a figure containing only go.Violin plots,
    ## so we have to update_layout before we add any of the go.Scatter traces

    ## add markers to indicate the mean rating gain for each rating bin
    fig.update_traces(orientation='h', side='positive', width=3, points=False)
    fig.update_layout(title=f'{time_group.capitalize()} Performance Difference by Rating Bin',
                      xaxis_title='Mean Performance Difference',
                      yaxis_title='Rating Bin',
                      xaxis_range=[-1.00,1.00],
                      xaxis_showgrid=False, xaxis_zeroline=False)
    fig.update_traces(orientation="h", side="positive", width=3, points=False)
    fig.update_layout(
        title=f"{time_group.capitalize()} Performance Difference by Rating Bin",
        xaxis_title="Mean Performance Difference",
        yaxis_title="Rating Bin",
        xaxis_range=[-1.00, 1.00],
        xaxis_showgrid=False,
        xaxis_zeroline=False,
    )

    for rating_bin, rating_group in time_group_df.groupby('rating_bin'):
    for rating_bin, rating_group in time_group_df.groupby("rating_bin"):
        rating_bin_str = f"{rating_bin}-{rating_bin+100}"
        fig.add_trace(go.Scatter(
            x=[rating_group['mean_perf_diff'].mean()],
            y=[rating_bin_str],
            mode='markers',
            showlegend=False,
            marker=dict(color='black', size=5),
            marker_symbol='diamond'
        ))

    fig.add_vline(x=0.0, line_dash="dash", line_color='blue', line_width=2, opacity=0.5)
    fig.write_html(f"exploratory_plots/{BASE_FILE_NAME}_{time_group}_perf_diff.html")
        fig.add_trace(
            go.Scatter(
                x=[rating_group["mean_perf_diff"].mean()],
                y=[rating_bin_str],
                mode="markers",
                showlegend=False,
                marker=dict(color="black", size=5),
                marker_symbol="diamond",
            )
        )

    fig.add_vline(x=0.0, line_dash="dash", line_color="blue", line_width=2, opacity=0.5)
    fig.write_html(
        f"{Folders.EXPLORATORY_PLOTS.value}/{BASE_FILE_NAME}_{time_group}_perf_diff.html"
    )
42 changes: 0 additions & 42 deletions get_player_labels.py

This file was deleted.


