Commit: code cleanup
merillium committed Feb 4, 2024
1 parent 77058f8 commit 60d9241
Showing 9 changed files with 40 additions and 44 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -8,7 +8,7 @@ Currently the app is not functional, and has not been deployed. If cloning this
This is a simple statistical model that flags players who have performed above a certain threshold relative to their expected performance under the Glicko-2 rating system. The expected performance takes into account each player's complete game history and opponents within the span of the training data. The thresholds are initialized to default values, but are then adjusted separately for each 100-point rating bin in the training data.

### Model Training
-We define `N` as the number of players who have performed above some threshold, and the estimated number of cheaters as `X = 0.00 * N_open + 0.75 * N_closed + 1.00 * N_violation` where `N_open` is the number of players with open accounts, `N_closed` is the number of players with closed accounts, and `N_violation` is the number of players with a terms of service violation (where `N = N_open + N_closed + N_violation`), the metric used to evaluate the performance of the threshold is the `log(N+1) * X / N`. This a simple metric intended to reward the model for `high accuracy = X / N`` in detecting suspicious players without flagging too many players (observationally, if the threshold is too low, the accuracy will decrease faster than log(N)). Note that for a threshold that is too high and flags 0 players, the metric will be 0. This metric may be fine-tuned in the future, but is sufficient for a POC.
+We define `N` as the number of players who have performed above some threshold, and the estimated number of cheaters as `X = 0.00 * N_open + 0.75 * N_closed + 1.00 * N_violation` where `N_open` is the number of players with open accounts, `N_closed` is the number of players with closed accounts, and `N_violation` is the number of players with a terms of service violation (where `N = N_open + N_closed + N_violation`), the metric used to evaluate the performance of the threshold is the `log(N+1) * X / N`. This is a simple metric intended to reward the model for `high accuracy = X / N` in detecting suspicious players without flagging too many players (observationally, if the threshold is too low, the accuracy will decrease faster than log(N)). Note that for a threshold that is too high and flags 0 players, the metric will be 0. This metric may be fine-tuned in the future, but is sufficient for a POC.
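For concreteness, here is a minimal sketch of how this threshold metric could be computed (the function name and the sample counts are illustrative, not taken from the repository):

```python
import numpy as np

def threshold_metric(n_open: int, n_closed: int, n_violation: int) -> float:
    """Score a candidate threshold from the account statuses of the flagged players."""
    n = n_open + n_closed + n_violation
    if n == 0:
        return 0.0  # a threshold so high that it flags no players scores 0
    x = 0.00 * n_open + 0.75 * n_closed + 1.00 * n_violation
    return np.log(n + 1) * x / n  # rewards accuracy (x / n) without flagging too many players

# e.g. 10 flagged players: 6 open, 3 closed, 1 tosViolation
# x = 0.75 * 3 + 1.00 * 1 = 3.25, accuracy x / n = 0.325
print(threshold_metric(6, 3, 1))  # log(11) * 0.325 ≈ 0.78
```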

Below is an example of the threshold vs accuracy plot for players in the 1200-1300 range, based on training data from the month of Jan 2015.

@@ -20,6 +20,7 @@ The model is built on the assumption that cheating is a rare occurrence in any d
### Sample code:
```python
import pandas as pd

from player_account_handler import PlayerAccountHandler
from model import PlayerAnomalyDetectionModel
BASE_FILE_NAME = 'lichess_db_standard_rated_2015-01'
5 changes: 5 additions & 0 deletions enums.py
@@ -2,14 +2,19 @@


class TimeControl(Enum):
"""Enum to represent the time control of a chess game."""

BULLET = "bullet"
BLITZ = "blitz"
RAPID = "rapid"
CLASSICAL = "classical"
OTHER = "other"
ALL = ["bullet", "blitz", "rapid", "classical"]


class Folders(Enum):
"""Enum to represent the default folder name(s) in the project."""

MODEL_PLOTS = "model_plots"
SAVED_MODELS = "saved_models"
EXPLORATORY_PLOTS = "exploratory_plots"
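As a quick illustration of how these enums might be consumed elsewhere (model.py iterates over `TimeControl.ALL.value`, for example), here is a small hedged sketch:

```python
from enums import TimeControl, Folders

# iterate over the canonical time controls; ALL deliberately excludes "other"
for time_control in TimeControl.ALL.value:
    print(f"fitting thresholds for {time_control} games...")

# build an output path from the Folders enum instead of a hard-coded string
plot_path = f"{Folders.MODEL_PLOTS.value}/example_plot.html"
```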
7 changes: 2 additions & 5 deletions exploratory_plots.py
@@ -1,9 +1,6 @@
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from enums import Folders

## constants could eventually go into enums
@@ -58,7 +55,7 @@
y=[rating_bin_str],
mode="markers",
showlegend=False,
marker=dict(color="black", size=5),
marker={"color": "black", "size": 5},
marker_symbol="diamond",
)
)
@@ -107,7 +104,7 @@
y=[rating_bin_str],
mode="markers",
showlegend=False,
marker=dict(color="black", size=5),
marker={"color": "black", "size": 5},
marker_symbol="diamond",
)
)
16 changes: 7 additions & 9 deletions make_player_features.py
@@ -1,8 +1,5 @@
import lichess.api
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

BASE_FILE_NAME = "lichess_db_standard_rated_2015-01"

@@ -31,11 +28,11 @@
## and this comment left by @chess_in_sgv:

# Let P2 = Expected outcome for player 2. Then:
# P2 = 1 / (1 + e-A)
# with A = g(sqrt(r1^2 + r2^2)) * (s2 - s1)
# P2 = 1 / (1 + e^-a)
# with a = g(sqrt(r1^2 + r2^2)) * (s2 - s1)
# and g(x) = 1/sqrt(1 + 3x^2/pi^2)

## source: https://www.reddit.com/r/chess/comments/i0pnv1/comment/fzrhhwi/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button
## source: https://www.reddit.com/r/chess/comments/i0pnv1/comment/fzrhhwi


def g(x):
@@ -46,7 +43,9 @@ def get_player_expected_score(
player_rating, opponent_rating, player_rd=80.0, opponent_rd=80.0
):
"""Returns expected score of player based on player rating, opponent rating, and RDs (if known)."""
A = g(np.sqrt(player_rd**2 + opponent_rd**2)) * (player_rating - opponent_rating)
A = g(np.sqrt(player_rd**2 + opponent_rd**2)) * (
player_rating - opponent_rating
)
return 1 / (1 + np.exp(-A))
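A short usage sketch (the ratings are hypothetical, and the body of `g` is collapsed above, so no exact value is asserted here):

```python
# expected score of a 1850-rated player against a 1700-rated opponent,
# with both rating deviations left at the 80.0 default
expected = get_player_expected_score(1850, 1700)
print(f"expected score: {expected:.3f}")  # above 0.5 for the higher-rated player

# the performance difference used by the model is then
# actual score minus expected score (positive = over-performing)
perf_diff = 1.0 - expected  # e.g. after a win
```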


@@ -86,7 +85,7 @@ def get_player_expected_score(
proportion_increment_games=("increments", "mean"),
)

## some [potentially] useful red flags for suspicious behavior:
## some useful red flags for suspicious behavior:
# (1) consistently performing above expectation
# (i.e. mean performance difference far from 0.00 with low standard deviation performance difference)
# we may refine this to drop the low standard deviation performance difference condition
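A hedged pandas sketch of red flag (1); `mean_perf_diff` follows the aggregation above, `std_perf_diff` is an assumed companion column, and both cutoffs are purely illustrative:

```python
# flag players whose mean performance difference sits far above 0.00
# while their game-to-game variation stays suspiciously low
suspicious = all_player_features[
    (all_player_features["mean_perf_diff"] > 0.15)   # illustrative cutoff
    & (all_player_features["std_perf_diff"] < 0.10)  # illustrative cutoff
]
```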
@@ -111,7 +110,6 @@ def get_player_expected_score(
all_player_features["rating_bin"] = pd.cut(
all_player_features["mean_rating"], rating_bins, right=True, labels=rating_bins[:-1]
).astype(int)
# all_player_features['rating_bin_label'] = pd.cut(all_player_features['mean_rating'], rating_bins, right=True, labels=rating_bin_labels)

## save to csv
all_player_features.to_csv(f"lichess_player_data/{BASE_FILE_NAME}_player_features.csv")
14 changes: 6 additions & 8 deletions model.py
@@ -30,7 +30,7 @@ def __init__(self, player_account_handler):
for time_control in TimeControl.ALL.value
}
self._player_account_handler = player_account_handler
self._ACCOUNT_STATUS_SCORE_MAP = {
self._account_status_score_map = {
"open": 0,
"tosViolation": 1,
"closed": 0.75, # weight closed account as closer to a tosViolation
@@ -43,13 +43,12 @@ def load_model(self, model_file_name: str):
pass

def fit(self, train_data: pd.DataFrame, generate_plots=True):
if self.is_fitted:
pass
# issue a warning that the user is retraining the model!
# give the user the option to combine multiple training data sets
else:
if not self.is_fitted:
self._set_thresholds(train_data, generate_plots)
self.is_fitted = True
else:
print("Warning: model is already fitted")
pass

def _set_thresholds(self, train_data, generate_plots):
## set thresholds by each rating bin, also updates player account statuses
@@ -78,7 +77,6 @@ def _set_thresholds(self, train_data, generate_plots):
train_number_of_flagged_players = []

while True:

all_flagged_players = train_rating_bin_df[
train_rating_bin_df["mean_perf_diff"] > train_threshold
]["player"].tolist()
@@ -107,7 +105,7 @@

## get the score for each player
train_scores = [
self._ACCOUNT_STATUS_SCORE_MAP.get(status)
self._account_status_score_map.get(status)
for status in train_predictions
]

12 changes: 7 additions & 5 deletions model_plots.py
@@ -4,15 +4,17 @@


def generate_model_threshold_plots(
BASE_FILE_NAME,
MODEL_PLOTS_FOLDER,
base_file_name,
model_plots_folder,
train_threshold_list,
train_accuracy_list,
train_number_of_flagged_players,
best_threshold,
time_control,
rating_bin_key,
):
"""Generate model threshold plots showing accuracy and number of players vs model threshold(s)."""

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
go.Scatter(
@@ -42,8 +44,8 @@ def generate_model_threshold_plots(
yaxis2_title="Number of Flagged Players",
yaxis_range=[0, 1],
)
if not os.path.exists(MODEL_PLOTS_FOLDER):
os.mkdir(MODEL_PLOTS_FOLDER)
if not os.path.exists(model_plots_folder):
os.mkdir(model_plots_folder)
fig.write_html(
f"{MODEL_PLOTS_FOLDER}/{BASE_FILE_NAME}_model_thresholds_{time_control}_{rating_bin_key}.html"
f"{model_plots_folder}/{base_file_name}_model_thresholds_{time_control}_{rating_bin_key}.html"
)
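A hypothetical invocation of this function (argument values are illustrative; the three lists are aligned element-by-element across the candidate thresholds):

```python
generate_model_threshold_plots(
    base_file_name="lichess_db_standard_rated_2015-01",
    model_plots_folder="model_plots",
    train_threshold_list=[0.05, 0.10, 0.15],
    train_accuracy_list=[0.2, 0.5, 0.8],
    train_number_of_flagged_players=[40, 12, 3],
    best_threshold=0.10,
    time_control="blitz",
    rating_bin_key=1200,
)
```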
21 changes: 10 additions & 11 deletions parse_pgn.py
@@ -1,7 +1,6 @@
import os
import pandas as pd
import chess.pgn

from enums import TimeControl

BASE_FILE_NAME = "lichess_db_standard_rated_2015-01"
@@ -56,7 +55,7 @@ def update_all_player_info(
}

# exclude a rating of 1500.0 exactly as this could be a first game
# refine analysis by excluding the first N0 = 10 games if the first rating is 1500.0
# refine analysis by excluding the first N_0 = 10 games if the first rating is 1500.0
elif (all_player_info.get((player, time_control)) is None) & (
current_rating == 1500.0
):
@@ -85,16 +84,16 @@

# get time control
event = headers["Event"]
if "bullet" in event.lower():
time_control = "bullet"
elif "blitz" in event.lower():
time_control = "blitz"
elif "rapid" in event.lower():
time_control = "rapid"
elif "classical" in event.lower():
time_control = "classical"
if TimeControl.BULLET.value in event.lower():
time_control = TimeControl.BULLET.value
elif TimeControl.BLITZ.value in event.lower():
time_control = TimeControl.BLITZ.value
elif TimeControl.RAPID.value in event.lower():
time_control = TimeControl.RAPID.value
elif TimeControl.CLASSICAL.value in event.lower():
time_control = TimeControl.CLASSICAL.value
else:
time_control = "other"
time_control = TimeControl.OTHER.value

# get info for both players
white_player, black_player = headers.get("White"), headers.get("Black")
4 changes: 1 addition & 3 deletions player_account_handler.py
@@ -16,9 +16,7 @@ def __init__(self):
'open' = account in good standing
'not found' = account does not exist (this should not happen)
Note:
From lichess api documentation:
From the lichess api documentation:
All requests are rate limited using various strategies,
to ensure the API remains responsive for everyone.
Only make one request at a time. If you receive an HTTP response with a 429 status,
2 changes: 0 additions & 2 deletions tests/test_model.py
@@ -40,7 +40,6 @@ def get_sample_train_data():

@pytest.mark.usefixtures("get_sample_train_data", "build_training_data")
class TestPlayerAnomalyDetectionModel(unittest.TestCase):

@mock.patch(
"player_account_handler.PlayerAccountHandler.update_player_account_status"
)
@@ -53,7 +52,6 @@ def build_training_data(self, get_sample_train_data):
self.sample_train_data = get_sample_train_data

def test_fit(self):

## this is a workaround to avoid calling get_player_account_status
self.model._player_account_handler._account_statuses = {
"test_player1": "open",
