[Draft] Summarize all todos in the codebase #581

Draft · wants to merge 5 commits into base `main`
111 changes: 111 additions & 0 deletions scripts/1_make_index.py
@@ -0,0 +1,111 @@
# Walk through all files in the repo_path and output a csv file with all @TODO tags and their context
import argparse
import os
import re

import pandas as pd
import pathspec

parser = argparse.ArgumentParser()
parser.add_argument("--repo_path", type=str)
parser.add_argument("--output_file", default="make_index.csv", type=str)

# Define the number of lines before and after the TODO to include in context
CONTEXT_LINES = 5


def load_gitignore(repo_path):
gitignore_path = os.path.join(repo_path, ".gitignore")
if os.path.exists(gitignore_path):
with open(gitignore_path, encoding="utf-8") as f:
patterns = f.read().splitlines()
return pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, patterns)
return None


def is_ignored(file_path, ignore_spec, repo_path):
if ignore_spec:
relative_path = os.path.relpath(file_path, repo_path)
return ignore_spec.match_file(relative_path)
return False


def find_todos_in_file(file_path):
todos = []
    with open(file_path, encoding="utf-8", errors="ignore") as f:  # tolerate undecodable bytes in binary-ish files
lines = f.readlines()

for line_num, line in enumerate(lines):
if "@TODO" in line:
# Find function name (if exists)
func_name = find_function_name(lines, line_num)

# Extract context
start = max(0, line_num - CONTEXT_LINES)
end = min(len(lines), line_num + CONTEXT_LINES + 1)
context = "".join(lines[start:end])

todos.append(
{"File": file_path, "Function": func_name, "Line Number": line_num + 1, "Context": context.strip()}
)
return todos


def find_function_name(lines, line_num):
# Look backwards for the most recent function definition (Python example)
for i in range(line_num, -1, -1):
line = lines[i].strip()
# Adjust the regex for different languages as needed
if re.match(r"def\s+\w+\(", line): # Python function detection
return line.split("(")[0].replace("def ", "")
return "Unknown"


def search_repo_for_todos(repo_path, ignore_spec):
todos = []
# Walk through all files in the directory
    for root, dirs, files in os.walk(repo_path):
        dirs[:] = [d for d in dirs if d != ".git"]  # prune .git so we don't scan repository internals
for file in files:
file_path = os.path.join(root, file)

            if not file.endswith((".gz", ".lock")):  # adjust to skip other extensions as needed
if not is_ignored(file_path, ignore_spec, repo_path):
print(f"Processing {file_path}")
todos.extend(find_todos_in_file(file_path))
else:
print(f"Ignoring {file_path}")
else:
print(f"Skipping {file_path}")
return todos


def write_todos_to_csv(todos, output_file):
# Convert the todos list into a pandas DataFrame
df = pd.DataFrame(todos, columns=["File", "Function", "Line Number", "Context"])

# Write the DataFrame to a CSV file
df.to_csv(output_file, index=False)


if __name__ == "__main__":
args = parser.parse_args()

repo_path = args.repo_path
output_file = args.output_file

# Load .gitignore
ignore_spec = load_gitignore(repo_path)

# Search for TODOs
todos = search_repo_for_todos(repo_path, ignore_spec)

# Write TODOs to CSV
write_todos_to_csv(todos, output_file)

print(f"TODO report generated: {output_file}")

# @TODO: check for \n in the context (readlines keeps them,
# read().splitlines() removes them)....maybe use read().splitlines()?
# but might need \n to maintain context

# @TODO: fix bug; sometimes TODO does not lie in a function, it lies in a class.
175 changes: 175 additions & 0 deletions scripts/2_make_index_categories.py
@@ -0,0 +1,175 @@
import numpy as np
import pandas as pd


def numbered_list_to_numpy(list_string):
# Split the string into lines
lines = list_string.strip().split("\n")

# Remove whitespace from each line
items = [line.strip() for line in lines]

return np.array(items)


df = pd.read_csv("make_index.csv")
# print(df.shape)

# split into separate csv
idx = df.shape[0] // 2
df1 = df.iloc[:idx]
df2 = df.iloc[idx:]

df1.to_csv("make_index_1.csv")
df2.to_csv("make_index_2.csv")

# Follow the instructions in scripts/README.md about which Claude prompt
# to use to generate categories for make_index_1 and make_index_2.

# NOTE: You could have Claude output a list of categories and manually add it to the csv yourself.
# df1_categories = """
# Refactoring
# Functionality Enhancement
# Functionality Enhancement
# Refactoring
# Security/Access Control
# Security/Access Control
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Security/Access Control
# Security/Access Control
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Optimization
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Optimization
# Optimization
# Optimization
# Optimization
# Optimization
# Refactoring
# Refactoring
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Security/Access Control
# Optimization
# Optimization
# Functionality Enhancement
# Refactoring
# Refactoring
# Refactoring
# Refactoring
# Refactoring
# Refactoring
# Refactoring
# Refactoring
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Refactoring
# Functionality Enhancement
# Optimization
# Optimization
# Refactoring
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# """

# df2_categories = """
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Optimization
# Optimization
# Optimization
# Functionality Enhancement
# Functionality Enhancement
# Refactoring
# Functionality Enhancement
# Security/Access Control
# Security/Access Control
# Security/Access Control
# Security/Access Control
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Other
# """

# df1_categories = numbered_list_to_numpy(df1_categories)
# df2_categories = numbered_list_to_numpy(df2_categories)

# categories = np.concatenate((df1_categories, df2_categories))

# results = pd.concat([df1, df2], axis=0, ignore_index=True)
# print(df1.shape)
# print(df2.shape)
# print(results.shape)
# print(categories.shape)
# results["Category"] = categories

# results.to_csv("categories.csv")
17 changes: 17 additions & 0 deletions scripts/3_make_index_group.py
@@ -0,0 +1,17 @@
import pandas as pd

df = pd.read_csv("categories.csv")
df = df.drop(columns=["Unnamed: 0"])

# print(df.shape)

# split into 2 separate csvs for the Claude grouping step; the prompt mainly
# uses the File, Context, and Category columns
idx = df.shape[0] // 2
df1 = df.iloc[:idx]
df2 = df.iloc[idx:]

df1.to_csv("make_index_1.csv")
df2.to_csv("make_index_2.csv")

# Follow the instructions in scripts/README.md about what Claude prompt
# to use to generate groups for make_index_1 and make_index_2.
45 changes: 45 additions & 0 deletions scripts/README.md
@@ -0,0 +1,45 @@
# Example pipeline
```
python scripts/1_make_index.py --repo_path path_to_ami_repo
```
This will output a file `make_index.csv` with the headers `["File", "Function", "Line Number", "Context"]`.
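
For reference, a row of `make_index.csv` might look like this (the path, function name, and context below are illustrative, not taken from the actual repo):
```
File,Function,Line Number,Context
path_to_ami_repo/jobs/models.py,update_status,142,"... # @TODO handle failed jobs separately ..."
```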

```
python scripts/2_make_index_categories.py
```
Categories are the "type" of the TODO. This script splits `make_index.csv` into 2 separate csvs (`make_index_1.csv` and `make_index_2.csv`). Attach these files with the claude prompt below. 2 separate input csvs are made due to limits on the Claude free plan.

> This is a csv file with a list of todos from a code base. Each todo's file location, function location, line location, and context is given. Categorize the todos into the following categories:
> * Functionality Enhancement: Enhancing existing functionality or adding new features.
> * Refactoring: Code reorganization or improving structure.
> * Security/Access Control: Security-related improvements or access control.
> * Optimization: Performance or scalability improvements.
> * Other
> Give the results as a csv with the same headers as the input csv, but add a column called 'Category'.

Download the output csvs from Claude and combine them into a csv called `categories.csv`. This csv will have the headers `["File", "Function", "Line Number", "Context", "Category"]`.
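
As a minimal sketch, the two downloaded csvs can be combined with pandas (the download file names below are assumptions; use whatever names Claude produced):
```
import pandas as pd

# hypothetical names for the two csvs downloaded from Claude
df1 = pd.read_csv("make_index_1_categorized.csv")
df2 = pd.read_csv("make_index_2_categorized.csv")

# keep the default index column: 3_make_index_group.py expects to drop it as "Unnamed: 0"
pd.concat([df1, df2], ignore_index=True).to_csv("categories.csv")
```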

```
python scripts/3_make_index_group.py
```
This step groups TODOs in `categories.csv` that appear to refer to the same underlying task. Again, the script splits the input csv (`categories.csv`) into 2 separate csvs to be processed by Claude, reusing the `make_index_1.csv` and `make_index_2.csv` file names. Use the prompts below.

Claude prompt for make_index_1.csv:
> This csv contains a list of TODOs compiled from a repo. The columns give the file where each TODO is located, its context, and its category (functionality enhancement, refactoring, security/access control, optimization, other). Group the TODOs that appear to refer to the same todo. Output a csv with the same headers as the original csv, but add a column with the name of the group (or miscellaneous if no group).

Claude prompt for make_index_2.csv: (NOTE: you could reuse the groups from make_index_1 or ask Claude to generate new ones)
> Group the todos in this csv I attached if they appear to refer to the same task. Make a csv which I can download, containing the same columns as the input csv plus an additional group column. The groups can be as follows, but you can add new groups if a todo doesn't fit any of these well:
> - miscellaneous
> - Optimize S3 Operations
> - improve user interface
> - enhance data/time handling
> - improve taxonomy and classification
> - enhance ml pipeline
> - improve data source configuration
> - optimize database operations
> - enhance security and access control
> - improve event grouping
> - optimization of image processing
> - refactoring jobs and background tasks

Claude should output 2 csvs, which can be combined into a single one. The combined csv will have the headers `["File", "Function", "Line Number", "Context", "Category", "Group"]`.
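
A minimal sketch for that final combine, plus a per-group count to produce the actual summary (the download and output file names are assumptions):
```
import pandas as pd

# hypothetical names for the two grouped csvs downloaded from Claude
df1 = pd.read_csv("make_index_1_grouped.csv")
df2 = pd.read_csv("make_index_2_grouped.csv")
final = pd.concat([df1, df2], ignore_index=True)
final.to_csv("todo_summary.csv", index=False)

# quick summary: number of TODOs per group
print(final["Group"].value_counts())
```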