Skip to content

Commit

Permalink
Serialize file context (#28)
Browse files Browse the repository at this point in the history
* Dump file context

* Serialize file context

* Fix tests

* Add test workflow

* Update dependencies and fix tests

* Fix logging

* Run ruff

* Disable GitRepository
  • Loading branch information
aorwall authored Jul 30, 2024
1 parent 323582b commit 7ccd42a
Show file tree
Hide file tree
Showing 38 changed files with 3,198 additions and 966 deletions.
4 changes: 4 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
INDEX_STORE_DIR=/tmp/moatless/index-store
REPO_DIR=/tmp/repos
TRAJECTORY_DIR=/tmp/moatless/trajectories
PRONPT_LOG_DIR=/tmp/moatless/prompt_logs
23 changes: 23 additions & 0 deletions .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name: Run Tests

on:
push:
branches: [ '**' ]
pull_request:
branches: [ main ]

jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install Poetry
uses: snok/install-poetry@v1
- name: Install dependencies
run: poetry install
- name: Run tests
run: poetry run pytest
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,5 @@ cython_debug/
.idea
notebooks/.ipynb_checkpoints/
notebooks/local_experiments.ipynb

playground
360 changes: 360 additions & 0 deletions moatless/benchmark/claude_evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,360 @@
import json
import logging
from typing import Optional

import instructor

from moatless import Transitions
from moatless.benchmark.evaluation import create_evaluation_name, Evaluation
from moatless.edit.edit import EditCode
from moatless.edit.plan import PlanToCode
from moatless.find.decide import DecideRelevance
from moatless.find.identify import IdentifyCode
from moatless.find.search_v2 import SearchCode
from moatless.loop import Transition
from moatless.state import Finished, Rejected
from moatless.transitions import (
search_and_code_transitions,
search_transitions,
code_transitions,
)

# model = "claude-3-5-sonnet-20240620"

# model = "gpt-4o-2024-05-13"
model = "azure/gpt-4o"

# model = "openrouter/anthropic/claude-3.5-sonnet"

global_params = {
"model": model,
"temperature": 0.2,
"max_tokens": 2000,
"max_prompt_file_tokens": 8000,
}

state_params = {
SearchCode: {
"provide_initial_context": True,
"max_search_results": 75,
"initial_context_tokens": 6000,
"initial_search_results": 100,
"initial_context_spans_per_file": 5,
},
IdentifyCode: {"expand_context": True},
DecideRelevance: {
"finish_after_relevant_count": 1,
},
PlanToCode: {
"max_tokens_in_edit_prompt": 750,
"expand_context_with_related_spans": False,
"finish_on_review": True,
},
EditCode: {
"chain_of_thought": False,
"show_file_context": False,
"max_prompt_file_tokens": 8000,
},
}

index_store_dir = f"/home/albert/20240522-voyage-code-2"
repo_base_dir = "/tmp/repos"
evaluations_dir = "/home/albert/repos/albert/moatless/evaluations"

search_and_code = search_and_code_transitions(
global_params=global_params, state_params=state_params
)

identified_spans_but_failed_implementation = [
"django__django-11583",
"django__django-11179",
"django__django-12286",
"django__django-12700",
"django__django-12708",
"django__django-13315",
"django__django-13933",
"django__django-14382",
"django__django-14608",
"django__django-14787",
"django__django-14999",
"django__django-15347",
"django__django-15789",
"django__django-16041",
"django__django-16046",
"django__django-16595",
"matplotlib__matplotlib-26020",
"matplotlib__matplotlib-24149",
"mwaskom__seaborn-3190",
"psf__requests-3362",
"pytest-dev__pytest-5692",
"scikit-learn__scikit-learn-11281",
"django__django-2708",
"scikit-learn__scikit-learn-13241",
"scikit-learn__scikit-learn-13779",
"scikit-learn__scikit-learn-14894",
"scikit-learn__scikit-learn-15535",
"scikit-learn__scikit-learn-25570",
"sympy__sympy-18621",
"sympy__sympy-23117",
"sympy__sympy-22714",
"sympy__sympy-24213",
]

coding_test_set = [
"django__django-11848",
"django__django-12308",
"django__django-12497",
"django__django-13551",
"django__django-13660",
"django__django-14238",
"django__django-14411",
"django__django-14787",
"django__django-16041",
"django__django-17051",
"matplotlib__matplotlib-24149",
"mwaskom__seaborn-3190",
"psf__requests-1963",
"pylint-dev__pylint-6506",
"pylint-dev__pylint-7993",
"pytest-dev__pytest-7432",
"scikit-learn__scikit-learn-13142",
"scikit-learn__scikit-learn-25570",
"sphinx-doc__sphinx-7975",
"sympy__sympy-12481",
"sympy__sympy-14396",
"sympy__sympy-14817",
"sympy__sympy-15609",
"sympy__sympy-16988",
"sympy__sympy-18189",
"sympy__sympy-18532",
"sympy__sympy-21847",
"sympy__sympy-22005",
"sympy__sympy-22714",
"sympy__sympy-24066",
]

search_and_identify_set = [
"matplotlib__matplotlib-25442",
"matplotlib__matplotlib-23562",
"pytest-dev__pytest-11148",
"sphinx-doc__sphinx-8721",
"sphinx-doc__sphinx-10325",
"scikit-learn__scikit-learn-15535",
"scikit-learn__scikit-learn-11281",
"astropy__astropy-6938",
"sympy__sympy-17022",
"sympy__sympy-17139",
"sympy__sympy-13031",
"django__django-15814",
"django__django-15498",
"django__django-12125",
"django__django-13964",
"django__django-11964",
"django__django-14580",
"django__django-17087",
]


def run_evaluation():
max_file_context_lines = 1000

transitions = search_and_code_transitions(
state_params={
PlanToCode: {
"max_prompt_file_tokens": 16000,
"max_tokens_in_edit_prompt": 500,
"max_file_context_lines": max_file_context_lines,
}
},
)


def evaluate_search():
transitions = Transitions(
global_params=global_params,
state_params={
SearchCode: {"max_search_results": 50, "provide_initial_context": True},
},
initial_state=SearchCode,
transitions=[
Transition(source=SearchCode, dest=Finished, trigger="did_search"),
Transition(source=SearchCode, dest=Finished, trigger="finish"),
],
)

evaluation_name = create_evaluation_name(model, "search")

evaluation = Evaluation(
transitions=transitions,
evaluations_dir=evaluations_dir + "/search",
evaluation_name=evaluation_name,
index_store_dir=index_store_dir,
repo_base_dir=repo_base_dir,
max_file_context_tokens=16000,
litellm_callback="langfuse",
detailed_report=True,
)

evaluation.run_evaluation_with_moatless_dataset(use_test_subset=True)


def evaluate_search_and_identify(
resolved_by: Optional[int] = 4,
previous_trajectory_dir: Optional[str] = None,
instance_ids: Optional[list] = None,
):
transitions = search_transitions(
global_params=global_params,
state_params=state_params,
)

evaluation_name = create_evaluation_name("search_and_identify_3", model)
# evaluation_name = "20240624_search_and_identify_claude-3-5-sonnet-20240620"

evaluation = Evaluation(
transitions=transitions,
evaluations_dir=evaluations_dir + "/search_and_identify",
evaluation_name=evaluation_name,
index_store_dir=index_store_dir,
repo_base_dir=repo_base_dir,
previous_trajectory_dir=previous_trajectory_dir,
max_file_context_tokens=16000,
litellm_callback="langfuse",
detailed_report=True,
)

evaluation.run_evaluation_with_moatless_dataset(
resolved_by=resolved_by, instance_ids=instance_ids
)


def evaluate_search_and_code(
resolved_by: Optional[int],
previous_trajectory_dir: Optional[str] = None,
retry_state: Optional[str] = None,
instance_ids: Optional[list] = None,
):
evaluation_name = create_evaluation_name("search_and_code", model)
# evaluation_name = "20240624_search_and_code_2_claude-3-5-sonnet-20240620"
# evaluation_name = "20240623_moatless_claude-3.5-sonnet"

evaluation = Evaluation(
transitions=search_and_code,
evaluations_dir=evaluations_dir + "/search_and_code",
evaluation_name=evaluation_name,
index_store_dir=index_store_dir,
repo_base_dir=repo_base_dir,
previous_trajectory_dir=previous_trajectory_dir,
retry_state=retry_state,
max_file_context_tokens=16000,
num_workers=3,
litellm_callback="langfuse",
detailed_report=True,
)

evaluation.run_evaluation_with_moatless_dataset(
resolved_by=resolved_by,
instance_ids=instance_ids,
)


def evaluate_coding():
evaluation_name = create_evaluation_name("coding", model)
# evaluation_name = "20240623_coding_2_claude-3.5-sonnet"

evaluation = Evaluation(
transitions=code_transitions(
global_params=global_params, state_params=state_params
),
use_expected_file_context=True,
evaluations_dir=evaluations_dir + "/coding",
evaluation_name=evaluation_name,
index_store_dir=index_store_dir,
repo_base_dir=repo_base_dir,
max_file_context_tokens=16000,
litellm_callback="langfuse",
detailed_report=True,
)

df = evaluation.run_evaluation_with_moatless_dataset(instance_ids=coding_test_set)


def evaluate_plan(previous_trajectory_dir: Optional[str] = None):
transitions = Transitions(
global_params=global_params,
state_params={
SearchCode: {
"provide_initial_context": True,
"max_search_results": 75,
"initial_context_tokens": 6000,
"initial_search_results": 100,
"initial_context_spans_per_file": 5,
},
PlanToCode: {
"max_prompt_file_tokens": 16000,
"max_tokens_in_edit_prompt": 750,
"expand_context_with_related_spans": False,
},
},
initial_state=SearchCode,
transitions=[
Transition(source=SearchCode, dest=IdentifyCode, trigger="did_search"),
Transition(source=IdentifyCode, dest=SearchCode, trigger="search"),
Transition(source=IdentifyCode, dest=DecideRelevance, trigger="finish"),
Transition(source=DecideRelevance, dest=SearchCode, trigger="search"),
Transition(
source=DecideRelevance,
dest=PlanToCode,
trigger="finish",
exclude_fields={"message"},
),
Transition(source=PlanToCode, dest=Finished, trigger="edit_code"),
Transition(source=PlanToCode, dest=Rejected, trigger="finish"),
Transition(source=PlanToCode, dest=Rejected, trigger="reject"),
],
)

evaluation_name = create_evaluation_name("search_and_plan_2", model)

evaluation = Evaluation(
transitions=transitions,
evaluations_dir=evaluations_dir + "/search_and_plan",
evaluation_name=evaluation_name,
index_store_dir=index_store_dir,
repo_base_dir=repo_base_dir,
previous_trajectory_dir=previous_trajectory_dir,
retry_state="PlanToCode",
max_file_context_tokens=16000,
litellm_callback="langfuse",
detailed_report=True,
)

df = evaluation.run_evaluation_with_moatless_dataset(
instance_ids=identified_spans_but_failed_implementation
)

# print out instance id and if planned
for instance_id in df.index:
print(df.loc[instance_id, "instance_id"], df.loc[instance_id, "planned"])


if __name__ == "__main__":
logging.basicConfig(
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
level=logging.INFO,
)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("LiteLLM").setLevel(logging.WARNING)
logging.getLogger("Evaluator").setLevel(logging.INFO)

# evaluate_coding()
# evaluate_search_and_identify()
evaluate_search_and_code(
1,
"/home/albert/repos/albert/moatless/evaluations/20240623_moatless_claude-3.5-sonnet/trajs",
retry_state="PlanToCode",
)
# evaluate_search_and_code()
# evaluate_search_and_code(
# # "/home/albert/repos/albert/moatless/evaluations/search_and_code/20240622_search_and_code_6_claude-3.5-sonnet/trajs"
# )
File renamed without changes.
Loading

0 comments on commit 7ccd42a

Please sign in to comment.