[Draft] Summarize all todos in the codebase #581

Draft · wants to merge 5 commits into base `main`
111 changes: 111 additions & 0 deletions scripts/1_make_index.py
@@ -0,0 +1,111 @@
# Walk through all files in the repo_path and output a csv file with all @TODO tags and their context
import argparse
import os
import re

import pandas as pd
import pathspec

parser = argparse.ArgumentParser()
parser.add_argument("--repo_path", type=str)
parser.add_argument("--output_file", default="make_index.csv", type=str)

# Define the number of lines before and after the TODO to include in context
CONTEXT_LINES = 5


def load_gitignore(repo_path):
gitignore_path = os.path.join(repo_path, ".gitignore")
if os.path.exists(gitignore_path):
with open(gitignore_path, encoding="utf-8") as f:
patterns = f.read().splitlines()
return pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, patterns)
return None


def is_ignored(file_path, ignore_spec, repo_path):
if ignore_spec:
relative_path = os.path.relpath(file_path, repo_path)
return ignore_spec.match_file(relative_path)
return False


def find_todos_in_file(file_path):
todos = []
    with open(file_path, encoding="utf-8", errors="ignore") as f:  # tolerate undecodable bytes in binary-ish files
lines = f.readlines()

for line_num, line in enumerate(lines):
if "@TODO" in line:
# Find function name (if exists)
func_name = find_function_name(lines, line_num)

# Extract context
start = max(0, line_num - CONTEXT_LINES)
end = min(len(lines), line_num + CONTEXT_LINES + 1)
context = "".join(lines[start:end])

todos.append(
{"File": file_path, "Function": func_name, "Line Number": line_num + 1, "Context": context.strip()}
)
return todos


def find_function_name(lines, line_num):
# Look backwards for the most recent function definition (Python example)
for i in range(line_num, -1, -1):
line = lines[i].strip()
# Adjust the regex for different languages as needed
if re.match(r"def\s+\w+\(", line): # Python function detection
return line.split("(")[0].replace("def ", "")
return "Unknown"


def search_repo_for_todos(repo_path, ignore_spec):
todos = []
# Walk through all files in the directory
    for root, dirs, files in os.walk(repo_path):
        dirs[:] = [d for d in dirs if d != ".git"]  # prune .git so we don't scan repository internals
for file in files:
file_path = os.path.join(root, file)

            if not file.endswith((".gz", ".lock")):  # adjust to skip other extensions as needed
if not is_ignored(file_path, ignore_spec, repo_path):
print(f"Processing {file_path}")
todos.extend(find_todos_in_file(file_path))
else:
print(f"Ignoring {file_path}")
else:
print(f"Skipping {file_path}")
return todos


def write_todos_to_csv(todos, output_file):
# Convert the todos list into a pandas DataFrame
df = pd.DataFrame(todos, columns=["File", "Function", "Line Number", "Context"])

# Write the DataFrame to a CSV file
df.to_csv(output_file, index=False)


if __name__ == "__main__":
args = parser.parse_args()

repo_path = args.repo_path
output_file = args.output_file

# Load .gitignore
ignore_spec = load_gitignore(repo_path)

# Search for TODOs
todos = search_repo_for_todos(repo_path, ignore_spec)

# Write TODOs to CSV
write_todos_to_csv(todos, output_file)

print(f"TODO report generated: {output_file}")

# @TODO: check for \n in the context (readlines keeps them,
# read().splitlines() removes them)....maybe use read().splitlines()?
# but might need \n to maintain context

# @TODO: fix bug; sometimes TODO does not lie in a function, it lies in a class.
175 changes: 175 additions & 0 deletions scripts/2_make_index_categories.py
@@ -0,0 +1,175 @@
import numpy as np
import pandas as pd


def numbered_list_to_numpy(list_string):
# Split the string into lines
lines = list_string.strip().split("\n")

# Remove whitespace from each line
items = [line.strip() for line in lines]

return np.array(items)


df = pd.read_csv("make_index.csv")
# print(df.shape)

# split into separate csv
idx = df.shape[0] // 2
df1 = df.iloc[:idx]
df2 = df.iloc[idx:]

df1.to_csv("make_index_1.csv")
df2.to_csv("make_index_2.csv")

# Follow the instructions in scripts/README.md about which Claude prompt
# to use to generate categories for make_index_1 and make_index_2.

# NOTE: You could have Claude output a list of categories and manually add it to the csv yourself.
# df1_categories = """
# Refactoring
# Functionality Enhancement
# Functionality Enhancement
# Refactoring
# Security/Access Control
# Security/Access Control
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Security/Access Control
# Security/Access Control
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Optimization
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Optimization
# Optimization
# Optimization
# Optimization
# Optimization
# Refactoring
# Refactoring
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Security/Access Control
# Optimization
# Optimization
# Functionality Enhancement
# Refactoring
# Refactoring
# Refactoring
# Refactoring
# Refactoring
# Refactoring
# Refactoring
# Refactoring
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Refactoring
# Functionality Enhancement
# Optimization
# Optimization
# Refactoring
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# """

# df2_categories = """
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Optimization
# Optimization
# Optimization
# Functionality Enhancement
# Functionality Enhancement
# Refactoring
# Functionality Enhancement
# Security/Access Control
# Security/Access Control
# Security/Access Control
# Security/Access Control
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Functionality Enhancement
# Other
# """

# df1_categories = numbered_list_to_numpy(df1_categories)
# df2_categories = numbered_list_to_numpy(df2_categories)

# categories = np.concatenate((df1_categories, df2_categories))

# results = pd.concat([df1, df2], axis=0, ignore_index=True)
# print(df1.shape)
# print(df2.shape)
# print(results.shape)
# print(categories.shape)
# results["Category"] = categories

# results.to_csv("categories.csv")
17 changes: 17 additions & 0 deletions scripts/3_make_index_group.py
@@ -0,0 +1,17 @@
import pandas as pd

df = pd.read_csv("categories.csv")
df = df.drop(columns=["Unnamed: 0"])

# print(df.shape)

# split into 2 separate csvs for the Claude grouping step; the prompt mainly
# uses the File, Context, and Category columns
idx = df.shape[0] // 2
df1 = df.iloc[:idx]
df2 = df.iloc[idx:]

df1.to_csv("make_index_1.csv")
df2.to_csv("make_index_2.csv")

# Follow the instructions in scripts/README.md about what Claude prompt
# to use to generate groups for make_index_1 and make_index_2.
45 changes: 45 additions & 0 deletions scripts/README.md
@@ -0,0 +1,45 @@
# Example pipeline
```
python scripts/1_make_index.py --repo_path path_to_ami_repo
```
This will output a file `make_index.csv` with the headers `["File", "Function", "Line Number", "Context"]`.
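
For reference, a row of `make_index.csv` might look like this (the path, function name, and context below are illustrative, not taken from the actual repo):
```
File,Function,Line Number,Context
path_to_ami_repo/jobs/models.py,update_status,142,"... # @TODO handle failed jobs separately ..."
```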

```
python scripts/2_make_index_categories.py
```
Categories are the "type" of the TODO. This script splits `make_index.csv` into 2 separate csvs (`make_index_1.csv` and `make_index_2.csv`). Attach these files with the claude prompt below. 2 separate input csvs are made due to limits on the Claude free plan.

> This is a csv file with a list of todos from a code base. Each todo's file location, function location, line location, and context is given. Categorize the todos into the following categories:
> * Functionality Enhancement: Enhancing existing functionality or adding new features.
> * Refactoring: Code reorganization or improving structure.
> * Security/Access Control: Security-related improvements or access control.
> * Optimization: Performance or scalability improvements.
> * Other
> Give the results as a csv with the same headers as the input csv, but add a column called 'Category'.

Download the output csvs from Claude and combine them into a csv called `categories.csv`. This csv will have the headers `["File", "Function", "Line Number", "Context", "Category"]`.
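
As a minimal sketch, the two downloaded csvs can be combined with pandas (the download file names below are assumptions; use whatever names Claude produced):
```
import pandas as pd

# hypothetical names for the two csvs downloaded from Claude
df1 = pd.read_csv("make_index_1_categorized.csv")
df2 = pd.read_csv("make_index_2_categorized.csv")

# keep the default index column: 3_make_index_group.py expects to drop it as "Unnamed: 0"
pd.concat([df1, df2], ignore_index=True).to_csv("categories.csv")
```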

```
python scripts/3_make_index_group.py
```
This step groups TODOs in `categories.csv` that appear to refer to the same underlying task. Again, the script splits the input csv (`categories.csv`) into 2 separate csvs to be processed by Claude, reusing the `make_index_1.csv` and `make_index_2.csv` file names. Use the prompts below.

Claude prompt for make_index_1.csv:
> This csv contains a list of TODOs compiled from a repo. The columns give the file where each TODO is located, its context, and its category (functionality enhancement, refactoring, security/access control, optimization, other). Group the TODOs that appear to refer to the same todo. Output a csv with the same headers as the original csv, but add a column with the name of the group (or miscellaneous if no group).

Claude prompt for make_index_2.csv: (NOTE: you could reuse the groups from make_index_1 or ask Claude to generate new ones)
> Group the todos in this csv I attached if they appear to refer to the same task. Make a csv which I can download, containing the same columns as the input csv plus an additional group column. The groups can be as follows, but you can add new groups if a todo doesn't fit any of these well:
> - miscellaneous
> - Optimize S3 Operations
> - improve user interface
> - enhance data/time handling
> - improve taxonomy and classification
> - enhance ml pipeline
> - improve data source configuration
> - optimize database operations
> - enhance security and access control
> - improve event grouping
> - optimization of image processing
> - refactoring jobs and background tasks

Claude should output 2 csvs, which can be combined into a single one. The combined csv will have the headers `["File", "Function", "Line Number", "Context", "Category", "Group"]`.
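
A minimal sketch for that final combine, plus a per-group count to produce the actual summary (the download and output file names are assumptions):
```
import pandas as pd

# hypothetical names for the two grouped csvs downloaded from Claude
df1 = pd.read_csv("make_index_1_grouped.csv")
df2 = pd.read_csv("make_index_2_grouped.csv")
final = pd.concat([df1, df2], ignore_index=True)
final.to_csv("todo_summary.csv", index=False)

# quick summary: number of TODOs per group
print(final["Group"].value_counts())
```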