diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..3e85e0983
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,189 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Mac files
+*.DS_Store
+
+# Custom
+keys.cfg
+
+# iPython Notebooks
+*.ipynb
+
+# Evaluation folders
+results/
+testbed/
+temp/
+
+# Ignore all YAML files in data/
+data/*/ic-*
+data/*/single-issues
+
+# Fine tuning data
+fine_tune/*.ipynb
+fine_tune/subtasks/*.jsonl
+temp*.jsonl
+
+# Inspector
+inspector/*.json
+
+# Ignore all files in the private folder
+private/
+
+### Website
+
+# dependencies
+website/frontend/node_modules
+website/frontend/package-lock.json
+website/frontend/.pnp
+*.pnp.js
+
+# testing
+website/frontend/coverage
+
+# production
+website/frontend/build
+
+# misc
+*.env.local
+*.env.development.local
+*.env.test.local
+*.env.production.local
+.api_key
+*npm-debug.log*
+*yarn-debug.log*
+*yarn-error.log*
+
+
+# demo yamls (for editing)
+*.demo.yaml
+
+# trajectory files
+trajectories/*
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 000000000..e702436e2
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 John Yang, Carlos E. Jimenez, Alexander Wettig, Shunyu Yao, Karthik Narasimhan, Ofir Press
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 000000000..1fd5d174d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,98 @@
+# SWE-agent
+
+![SWE-agent banner](assets/swe-agent-banner.png)
+## Overview
+SWE-agent turns LMs (e.g. GPT-4) into software engineering agents that can fix bugs and issues in real GitHub repositories.
+
+On the full [SWE-bench](https://github.com/princeton-nlp/SWE-bench) test set, SWE-agent resolves **12.29%** of issues, achieving state-of-the-art performance.
+
+### Agent-Computer Interface (ACI)
+We accomplish these results by designing simple LM-centric commands and feedback formats to make it easier for the LM to browse the repository, view, edit and execute code files. We call this an **Agent-Computer Interface** (ACI) and build the SWE-agent repository to make it easy to iterate on ACI design for repository-level coding agents.
+
+Just as typical language models require good prompt engineering, good ACI design leads to much better results when using agents. As we show in our paper, a baseline agent without a well-tuned ACI does much worse than SWE-agent.
+
+SWE-agent contains features that we discovered to be immensely helpful during the agent-computer interface design process:
+1. We add a linter that runs when an edit command is issued, and do not let the edit command go through if the code isn't syntactically correct.
+2. We supply the agent with a purpose-built file viewer, instead of having it just `cat` files. We found that this file viewer works best when displaying just 100 lines in each turn. The file editor that we built has commands for scrolling up and down and for performing a search within the file.
+3. We supply the agent with a purpose-built full-directory string searching command. We found that it was important for this tool to succinctly list the matches: we simply list each file that had at least one match (see the example below). Showing the model more context about each match proved to be too confusing for the model.
+4. When commands have an empty output, we return a message saying "Your command ran successfully and did not produce any output."
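+
+For illustration, here is roughly what the summarized search output looks like (the file names and match counts below are invented; the format follows `config/commands/search.sh` in this PR):
+```
+search_dir "parse_config"
+Found 7 matches for "parse_config" in /repo:
+/repo/module/loader.py (5 matches)
+/repo/tests/test_loader.py (2 matches)
+End of matches for "parse_config" in /repo
+```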
+
+Read our paper for more details.
+
+```
+@misc{yang2024sweagent,
+ title={SWE-agent: Agent Computer Interfaces Enable Software Engineering Language Models},
+ author={John Yang and Carlos E. Jimenez and Alexander Wettig and Shunyu Yao and Karthik Narasimhan and Ofir Press},
+ year={2024},
+}
+```
+
+## Setup
+1. [Install Docker](https://docs.docker.com/engine/install/), then start Docker locally.
+2. [Install Miniconda](https://docs.anaconda.com/free/miniconda/miniconda-install/), then create the `swe-agent` environment with `conda env create -f environment.yml`
+3. Activate using `conda activate swe-agent`.
+4. Run `./setup.sh` to create the `swe-agent` docker image.
+5. Create a `keys.cfg` file at the root of this repository and fill in the following:
+```
+OPENAI_API_KEY: 'OpenAI API Key Here if using OpenAI Model (optional)'
+ANTHROPIC_API_KEY: 'Anthropic API Key Here if using Anthropic Model (optional)'
+GITHUB_TOKEN: 'GitHub Token Here (required)'
+```
+See the following links for tutorials on obtaining [Anthropic](https://docs.anthropic.com/claude/reference/getting-started-with-the-api), [OpenAI](https://platform.openai.com/docs/quickstart/step-2-set-up-your-api-key), and [GitHub]() tokens.
+
+## Usage
+There are two steps to the SWE-agent pipeline. First, SWE-agent takes an input GitHub issue and returns a pull request that attempts to fix it. We call that step *inference*. The second step (currently only available for issues in the SWE-bench benchmark) is to *evaluate* the pull request to verify that it has indeed fixed the issue.
+
+### Inference
+**Inference on *any* GitHub Issue**: Using this script, you can run SWE-agent on any GitHub issue!
+```
+python run.py --model_name gpt4 \
+ --data_path https://github.com/pvlib/pvlib-python/issues/1603 --config_file config/default_from_url.yaml
+```
+
+**Inference on SWE-bench**: Run SWE-agent on [SWE-bench Lite](https://www.swebench.com/lite.html) and generate patches.
+```
+python run.py --model_name gpt4 \
+ --per_instance_cost_limit 2.00 \
+ --config_file ./config/default.yaml
+```
+
+If you'd like to run on a *single* issue from SWE-bench, use the `--instance_filter` option as follows:
+```
+python run.py --model_name gpt4 \
+ --instance_filter marshmallow-code__marshmallow-1359
+```
+* See the [`scripts/`](scripts/) folder for other useful scripts and details.
+* See the [`config/`](config/) folder for details about how you can define your own configuration!
+* See the [`swe-agent/agent/`](agent/) folder for details about the logic behind configuration based workflows.
+* See the [`swe-agent/environment/`](swe-agent/environment/) folder for details about the `SWEEnv` environment (interface + implementation).
+* See the [`trajectories/`](trajectories) folder for details about the output of `run.py`.
+
+### Evaluation
+This step is only available for issues from the SWE-bench set. To evaluate generated pull requests:
+```
+cd evaluation/
+./run_eval.sh
+```
+Replace `` with the path to the model's predictions, which should be generated from the *Inference* step. The `` arguments should look like `../trajectories//--/all_preds.jsonl`
+* See the [`evaluation/`](evaluation/) folder for details about how evaluation works.
+
+
+## Contributions
+- If you'd like to ask questions, learn about upcoming features, and participate in future development, join our [Discord community](https://discord.gg/AVEFbBn2rH)!
+- If you'd like to contribute to the codebase, we welcome [issues](https://github.com/princeton-nlp/SWE-agent/issues) and [pull requests](https://github.com/princeton-nlp/SWE-agent/pulls)!
+- If you'd like to see a post or tutorial about some topic, please let us know via an [issue](https://github.com/princeton-nlp/SWE-agent/issues).
+
+## License
+MIT. Check `LICENSE`.
diff --git a/assets/inspector.png b/assets/inspector.png
new file mode 100644
index 000000000..7a071c543
Binary files /dev/null and b/assets/inspector.png differ
diff --git a/assets/swe-agent-banner.png b/assets/swe-agent-banner.png
new file mode 100644
index 000000000..09474356e
Binary files /dev/null and b/assets/swe-agent-banner.png differ
diff --git a/build_deploy.sh b/build_deploy.sh
new file mode 100755
index 000000000..6b46d126a
--- /dev/null
+++ b/build_deploy.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+python3 -m build
+
+python3 -m twine upload --skip-existing --repository pypi dist/*
+# python3 -m twine upload --skip-existing --repository testpypi dist/*
diff --git a/config/README.md b/config/README.md
new file mode 100644
index 000000000..17719f21e
--- /dev/null
+++ b/config/README.md
@@ -0,0 +1,78 @@
+# Configuration
+
+This folder contains details describing how to write your own configurations to control how agents can interact with the `SWEEnv` environment.
+A configuration is represented as a single `.yaml` file, allowing you to...
+* Define the **commands** that agents may use to traverse + modify a codebase.
+* Write **prompts** that are deterministically/conditionally shown to the agent over the course of a single trajectory.
+* Control the **input/output interface** that sits between the agent and `SWEEnv`.
+
+## Configuration File Fields
+The configuration is a `.yaml` file that consists of several fields. They are fully represented in the following outline:
+
+```yaml
+# Prompt Templates: Control how observations of environment are shown to agent
+system_template: | # .yaml syntax for multi-line string value
+ First `system` message shown to agent
+instance_template: |- # .yaml syntax for multi-line string value w/ no new line
+ Instance prompt, contains task instance-specific content
+next_step_template: |-
+ Format template of per-turn observation (Contains standard output from agent's action)
+next_step_no_output_template: |-
+ Format template of observation when there is no standard output from the agent's action
+format_error_template: |-
+ Format template of error message (Used when agent's action causes an error)
+demonstration_template: |
+ Format template for showing a demonstration to the agent
+demonstrations:
+- `trajectories///*.traj`
+- File is a demonstration of how to solve a task. This could be an agent-generated trajectory.
+- You can include 1+ demonstrations
+
+# Environment States: Define features of the SWEEnv environment
+env_variables:
+# Default variables for SWEEnv at the beginning of each instance
+ CURRENT_FILE: 0
+ CURRENT_LINE:
+ OVERLAP:
+ SEARCH_FILES:
+ SEARCH_INDEX:
+ SEARCH_RESULTS:
+ WINDOW_SIZE:
+ START_INDEX:
+ END_INDEX:
+ START_CURSOR:
+ END_CURSOR:
+ START_CURSOR_MARK:
+ END_CURSOR_MARK:
+state_command:
+# `state_command` allows you to update state variables to reflect any aspect of the environment (e.g. current working directory)
+ name: state
+ code: |
+  state() { echo '{"pwd": "'$PWD'"}'; };
+
+# Action Interface: Define how an agent interacts with the SWEEnv environment
+command_files:
+- path/to/bash_file.sh
+- Each file contains a list of commands implemented in bash
+- You can include 1+ command files
+parse_command: Reference to functionality for defining command documentation
+history_processor: Reference to functionality for controlling agent's message history
+parse_function: Parser run on agent output
+```
+
+We recommend looking at...
+* `configs/` for examples of properly formatted configuration files. Each configuration differs in its set of commands, input/output format, demonstrations, etc.
+* `commands/` for the bash implementations of the custom commands that SWE-agent uses to navigate + edit the codebase.
+
+## How a Configuration File is Processed
+Some notes on processing that occurs on config fields when SWE-agent is run:
+* Commands specified in `command_files` will be parsed into a single block of documentation text that can be referenced as `{command_docs}`.
+* `env_variables` are the default variables for the bash environment at the beginning of each instance.
+* `state_command` is used to extract state information from the bash environment (formatted as json) to be used in the templates given to the agent.
+
+Possible variables that can be used in templates are:
+- `{command_docs}` (an automatically compiled collection of available commands + their docstrings)
+- any variable given in `env_variables` (same spelling), e.g., `{WINDOW_SIZE}`
+- any variable extracted as json as part of the `state_command` function
+- the last observation `{observation}`
+- ... this list will grow as we implement more features!
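+
+For example, a `next_step_template` can combine several of these variables; here is a minimal sketch mirroring the configs in this PR (`open_file` and `working_dir` are extracted by the `state_command`):
+
+```yaml
+next_step_template: |-
+  {observation}
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+```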
\ No newline at end of file
diff --git a/config/commands/_split_string.py b/config/commands/_split_string.py
new file mode 100644
index 000000000..3ceccc15e
--- /dev/null
+++ b/config/commands/_split_string.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys
+
+def print_flake8_output(input_string, show_line_numbers=False):
+ for value in input_string.split("\n"):
+ parts = value.split()
+ if not show_line_numbers:
+ print(f"- {' '.join(parts[1:])}")
+ else:
+ line_nums = ":".join(parts[0].split(":")[1:])
+ print(f"- {line_nums} {' '.join(parts[1:])}")
+
+if __name__ == "__main__":
+ lint_output = sys.argv[1]
+ print_flake8_output(lint_output)
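+
+# Illustrative usage (the flake8 line below is a made-up example):
+#   python _split_string.py "foo.py:10:5: F821 undefined name 'bar'"
+# prints:
+#   - F821 undefined name 'bar'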
diff --git a/config/commands/cursors_defaults.sh b/config/commands/cursors_defaults.sh
new file mode 100644
index 000000000..f6168b134
--- /dev/null
+++ b/config/commands/cursors_defaults.sh
@@ -0,0 +1,306 @@
+_reset_cursors() {
+ export START_CURSOR=1
+ export END_CURSOR=1
+}
+
+_constrain_cursors() {
+ # constrain the cursors to be within the bounds of the file [0, total_lines+1]
+ local total_lines=$(awk 'END {print NR}' "$CURRENT_FILE")
+ total_lines=$((total_lines < 1 ? 1 : total_lines)) # if the file is empty, set total_lines to 1
+ local start_line=$((CURRENT_LINE - WINDOW / 2))
+ local end_line=$((CURRENT_LINE + WINDOW / 2))
+ start_line=$((start_line < 1 ? 1 : start_line))
+ end_line=$((end_line > total_lines ? total_lines : end_line))
+ local warning_string=""
+ if [ "$START_CURSOR" -lt "$start_line" ]; then
+ warning_string+="START_CURSOR moved to $start_line\n"
+ START_CURSOR=$start_line
+ elif [ "$START_CURSOR" -gt "$end_line" ]; then
+ START_CURSOR=$end_line
+ warning_string+="START_CURSOR moved to $end_line\n"
+ fi
+ if [ "$END_CURSOR" -lt "$start_line" ]; then
+ warning_string+="END_CURSOR moved to $start_line\n"
+ END_CURSOR=$start_line
+ elif [ "$END_CURSOR" -gt "$end_line" ]; then
+ warning_string+="END_CURSOR moved to $end_line\n"
+ END_CURSOR=$end_line
+ fi
+ export START_CURSOR END_CURSOR
+ echo "$warning_string"
+ echo $START_CURSOR $END_CURSOR
+}
+
+_print() {
+ local cursor_warning=$(_constrain_cursors)
+ local cursor_values=$(echo "$cursor_warning" | tail -n 1)
+ cursor_warning=$(echo "$cursor_warning" | head -n -1)
+ export START_CURSOR=$(echo "$cursor_values" | awk '{print $1}')
+ export END_CURSOR=$(echo "$cursor_values" | awk '{print $2}')
+ local total_lines=$(awk 'END {print NR}' $CURRENT_FILE)
+ echo "[File: $(realpath "$CURRENT_FILE") ($total_lines lines total)]"
+ local start_line=$((CURRENT_LINE - WINDOW / 2))
+ local end_line=$((CURRENT_LINE + WINDOW / 2))
+ start_line=$((start_line < 1 ? 1 : start_line))
+ end_line=$((end_line > total_lines ? total_lines : end_line))
+ local lines=()
+ local i=0
+ while IFS= read -r line; do
+ lines[i++]="$line"
+ done < <(awk -v start="$start_line" -v end="$end_line" 'NR>=start && NR<=end {print}' "$CURRENT_FILE")
+ local num_lines=${#lines[@]}
+ if [ $start_line -gt 1 ]; then
+ echo "($((start_line - 1)) more lines above)"
+ fi
+ for ((i=0; i < num_lines; i++)); do
+ local line_number=$((start_line + i))
+ if [ $line_number -eq $START_CURSOR ]; then
+ echo $START_CURSOR_MARK
+ fi
+ echo "$line_number:${lines[i]}"
+ if [ $line_number -eq $END_CURSOR ]; then
+ echo $END_CURSOR_MARK
+ fi
+ done
+ if [ $end_line -lt $total_lines ]; then
+ echo "($((total_lines - end_line)) more lines below)"
+ fi
+ if [ -n "$cursor_warning" ]; then
+ echo -e "$cursor_warning"
+ fi
+}
+
+_constrain_line() {
+ if [ -z "$CURRENT_FILE" ]
+ then
+ echo "No file open. Use the open command first."
+ return
+ fi
+ local max_line=$(awk 'END {print NR}' $CURRENT_FILE)
+ local half_window=$(jq -n "$WINDOW/2" | jq 'floor')
+ export CURRENT_LINE=$(jq -n "[$CURRENT_LINE, $max_line - $half_window] | min")
+ export CURRENT_LINE=$(jq -n "[$CURRENT_LINE, $half_window] | max")
+}
+
+# @yaml
+# signature: set_cursors <start_line> <end_line>
+# docstring: sets the start and end cursors to the given line numbers
+# arguments:
+# start_line:
+# type: integer
+# description: the line number to set the start cursor to
+# required: true
+# end_line:
+# type: integer
+# description: the line number to set the end cursor to
+# required: true
+set_cursors() {
+ if [ -z "$CURRENT_FILE" ]
+ then
+ echo "No file open. Use the open command first."
+ return
+ fi
+ if [ $# -lt 2 ]
+ then
+ echo "Usage: set_cursors <start_line> <end_line>"
+ return
+ fi
+ local start_line=$1
+ local end_line=$2
+ local re='^[0-9]+$'
+ if ! [[ $start_line =~ $re ]]
+ then
+ echo "Usage: set_cursors <start_line> <end_line>"
+ echo "Error: start_line must be a number"
+ return
+ fi
+ if ! [[ $end_line =~ $re ]]
+ then
+ echo "Usage: set_cursors <start_line> <end_line>"
+ echo "Error: end_line must be a number"
+ return
+ fi
+ if [ $start_line -gt $end_line ]
+ then
+ echo "Usage: set_cursors <start_line> <end_line>"
+ echo "Error: start_line must be less than or equal to end_line"
+ return
+ fi
+ export START_CURSOR=$start_line
+ export END_CURSOR=$end_line
+ _print
+}
+
+# @yaml
+# signature: open <path> [<line_number>]
+# docstring: opens the file at the given path in the editor. If line_number is provided, the window will be centered on that line
+# arguments:
+# path:
+# type: string
+# description: the path to the file to open
+# required: true
+# line_number:
+# type: integer
+# description: the line number to move the window to (if not provided, the window will start at the top of the file)
+# required: false
+open() {
+ if [ -z "$1" ]
+ then
+ echo "Usage: open <file>"
+ return
+ fi
+ # Check if the second argument is provided
+ if [ -n "$2" ]; then
+ # Check if the provided argument is a valid number
+ if ! [[ $2 =~ ^[0-9]+$ ]]; then
+ echo "Usage: open <file> [<line_number>]"
+ echo "Error: <line_number> must be a number"
+ return # Exit if the line number is not valid
+ fi
+ local max_line=$(awk 'END {print NR}' $1)
+ if [ $2 -gt $max_line ]; then
+ echo "Warning: <line_number> ($2) is greater than the number of lines in the file ($max_line)"
+ echo "Warning: Setting <line_number> to $max_line"
+ local line_number=$(jq -n "$max_line") # Set line number to max if greater than max
+ elif [ $2 -lt 1 ]; then
+ echo "Warning: <line_number> ($2) is less than 1"
+ echo "Warning: Setting <line_number> to 1"
+ local line_number=$(jq -n "1") # Set line number to 1 if less than 1
+ else
+ local line_number=$(jq -n "$2") # Set line number if valid
+ fi
+ else
+ local line_number=$(jq -n "$WINDOW/2") # Set default line number if not provided
+ fi
+
+ if [ -f "$1" ]; then
+ export CURRENT_FILE=$(realpath $1)
+ export CURRENT_LINE=$line_number
+ _constrain_line
+ _print
+ else
+ echo "File $1 not found"
+ fi
+}
+
+# @yaml
+# signature: scroll_down
+# docstring: moves the window down {WINDOW} lines
+scroll_down() {
+ if [ -z "$CURRENT_FILE" ]
+ then
+ echo "No file open. Use the open command first."
+ return
+ fi
+ export CURRENT_LINE=$(jq -n "$CURRENT_LINE + $WINDOW - $OVERLAP")
+ _constrain_line
+ _print
+}
+
+# @yaml
+# signature: scroll_up
+# docstring: moves the window up {WINDOW} lines
+scroll_up() {
+ if [ -z "$CURRENT_FILE" ]
+ then
+ echo "No file open. Use the open command first."
+ return
+ fi
+ export CURRENT_LINE=$(jq -n "$CURRENT_LINE - $WINDOW + $OVERLAP")
+ _constrain_line
+ _print
+}
+
+# @yaml
+# signature: goto <line_number>
+# docstring: moves the window to show <line_number>
+# arguments:
+# line_number:
+# type: integer
+# description: the line number to move the window to
+# required: true
+goto() {
+ if [ $# -gt 1 ]; then
+ echo "goto allows only one line number at a time."
+ return
+ fi
+ if [ -z "$CURRENT_FILE" ]
+ then
+ echo "No file open. Use the open command first."
+ return
+ fi
+ if [ -z "$1" ]
+ then
+ echo "Usage: goto <line_number>"
+ return
+ fi
+ if ! [[ $1 =~ ^[0-9]+$ ]]
+ then
+ echo "Usage: goto <line_number>"
+ echo "Error: <line_number> must be a number"
+ return
+ fi
+ local max_line=$(awk 'END {print NR}' $CURRENT_FILE)
+ if [ $1 -gt $max_line ]
+ then
+ echo "Error: <line_number> must be less than or equal to $max_line"
+ return
+ fi
+ local OFFSET=$(jq -n "$WINDOW/6" | jq 'floor')
+ export CURRENT_LINE=$(jq -n "[$1 + $WINDOW/2 - $OFFSET, 1] | max | floor")
+ _constrain_line
+ _print
+}
+
+# @yaml
+# signature: create <filename>
+# docstring: creates and opens a new file with the given name
+# arguments:
+# filename:
+# type: string
+# description: the name of the file to create
+# required: true
+create() {
+ if [ -z "$1" ]; then
+ echo "Usage: create <filename>"
+ return
+ fi
+
+ # Check if the file already exists
+ if [ -e "$1" ]; then
+ echo "Error: File '$1' already exists."
+ open "$1"
+ return
+ fi
+
+ # Create the file with an empty new line
+ printf "\n" > "$1"
+ # Use the existing open command to open the created file
+ open "$1"
+}
+
+# @yaml
+# signature: submit
+# docstring: submits your current code and terminates the session
+submit() {
+ cd $ROOT
+
+ # Check if the patch file exists and is non-empty
+ if [ -s "/root/test.patch" ]; then
+ # Apply the patch in reverse
+ git apply -R < "/root/test.patch"
+ fi
+
+ git add -A
+ git diff --cached > model.patch
+ echo "<>"
+}
\ No newline at end of file
diff --git a/config/commands/cursors_edit_linting.sh b/config/commands/cursors_edit_linting.sh
new file mode 100644
index 000000000..4357043ac
--- /dev/null
+++ b/config/commands/cursors_edit_linting.sh
@@ -0,0 +1,101 @@
+# @yaml
+# signature: |-
+# edit
+#   <replacement_text>
+# end_of_edit
+# docstring: replaces *all* of the text between the START CURSOR and the END CURSOR with the <replacement_text>. The replacement text is terminated by a line with only end_of_edit on it. All of the <replacement_text> will be entered, so make sure your indentation is formatted properly. To enter text at the beginning of the file, set START CURSOR and END CURSOR to 0. Use set_cursors to move the cursors around. Python files will be checked for syntax errors after the edit.
+# end_name: end_of_edit
+# arguments:
+# replacement_text:
+# type: string
+# description: the text to replace the current selection with
+# required: true
+edit() {
+ if [ -z "$CURRENT_FILE" ]
+ then
+ echo 'No file open. Use the `open` command first.'
+ return
+ fi
+ local start_line=$((START_CURSOR - 1))
+ start_line=$((start_line < 0 ? 0 : start_line))
+ local end_line=$((END_CURSOR))
+ end_line=$((end_line < 0 ? 0 : end_line))
+
+ local replacement=()
+ while IFS= read -r line
+ do
+ replacement+=("$line")
+ done
+
+ local num_lines=${#replacement[@]}
+ # Create a backup of the current file
+ cp "$CURRENT_FILE" "/root/$(basename "$CURRENT_FILE")_backup"
+ # Read the file line by line into an array
+ mapfile -t lines < "$CURRENT_FILE"
+ local new_lines=("${lines[@]:0:$start_line}" "${replacement[@]}" "${lines[@]:$((end_line))}")
+ # Write the new stuff directly back into the original file
+ printf "%s\n" "${new_lines[@]}" >| "$CURRENT_FILE"
+ # Run linter
+ if [[ $CURRENT_FILE == *.py ]]; then
+ lint_output=$(flake8 --select=F821,F822,F831,E111,E112,E113,E999,E902 "$CURRENT_FILE" 2>&1)
+ else
+ # do nothing
+ lint_output=""
+ fi
+ # if there is no output, then the file is good
+ if [ -z "$lint_output" ]; then
+ _constrain_line
+ # set to START + num_lines - 1, unless num_lines is 0, then set to START
+ export END_CURSOR=$((num_lines == 0 ? START_CURSOR : START_CURSOR + num_lines - 1))
+ export START_CURSOR=$START_CURSOR
+ _print
+ echo "File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary."
+ else
+ echo "Your proposed edit has introduced new syntax error(s). Please understand the fixes and retry your edit command."
+ echo ""
+ echo "ERRORS:"
+ _split_string "$lint_output"
+ echo ""
+
+ # Save original values
+ original_current_line=$CURRENT_LINE
+ original_window=$WINDOW
+ original_end_cursor=$END_CURSOR
+
+ # Update values
+ export CURRENT_LINE=$(( (num_lines / 2) + start_line )) # Set to "center" of edit
+ export WINDOW=$((num_lines + 10)) # Show +/- 5 lines around edit
+ export END_CURSOR=$((num_lines == 0 ? START_CURSOR : START_CURSOR + num_lines - 1))
+
+ echo "This is how your edit would have looked if applied"
+ echo "-------------------------------------------------"
+ _constrain_line
+ _print
+ echo "-------------------------------------------------"
+ echo ""
+
+ # Restoring CURRENT_FILE to original contents.
+ cp "/root/$(basename "$CURRENT_FILE")_backup" "$CURRENT_FILE"
+
+ export CURRENT_LINE=$(( ((end_line - start_line) / 2) + start_line )) # Set to "center" of edit
+ export WINDOW=$((end_line - start_line + 10))
+ export END_CURSOR=$original_end_cursor
+
+ echo "This is the original code before your edit"
+ echo "-------------------------------------------------"
+ _constrain_line
+ _print
+ echo "-------------------------------------------------"
+
+ # Restore original values
+ export CURRENT_LINE=$original_current_line
+ export WINDOW=$original_window
+ export END_CURSOR=$original_end_cursor
+
+ echo "Your changes have NOT been applied. Please fix your edit command and try again."
+ echo "You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code."
+ echo "DO NOT re-run the same failed edit command. Running it again will lead to the same error."
+ fi
+ # Remove backup file
+ rm -f "/root/$(basename "$CURRENT_FILE")_backup"
+}
diff --git a/config/commands/defaults.sh b/config/commands/defaults.sh
new file mode 100644
index 000000000..7294a732b
--- /dev/null
+++ b/config/commands/defaults.sh
@@ -0,0 +1,195 @@
+_print() {
+ local total_lines=$(awk 'END {print NR}' $CURRENT_FILE)
+ echo "[File: $(realpath $CURRENT_FILE) ($total_lines lines total)]"
+ lines_above=$(jq -n "$CURRENT_LINE - $WINDOW/2" | jq '[0, .] | max | floor')
+ lines_below=$(jq -n "$total_lines - $CURRENT_LINE - $WINDOW/2" | jq '[0, .] | max | round')
+ if [ $lines_above -gt 0 ]; then
+ echo "($lines_above more lines above)"
+ fi
+ cat $CURRENT_FILE | grep -n $ | head -n $(jq -n "[$CURRENT_LINE + $WINDOW/2, $WINDOW/2] | max | floor") | tail -n $(jq -n "$WINDOW")
+ if [ $lines_below -gt 0 ]; then
+ echo "($lines_below more lines below)"
+ fi
+}
+
+_constrain_line() {
+ if [ -z "$CURRENT_FILE" ]
+ then
+ echo "No file open. Use the open command first."
+ return
+ fi
+ local max_line=$(awk 'END {print NR}' $CURRENT_FILE)
+ local half_window=$(jq -n "$WINDOW/2" | jq 'floor')
+ export CURRENT_LINE=$(jq -n "[$CURRENT_LINE, $max_line - $half_window] | min")
+ export CURRENT_LINE=$(jq -n "[$CURRENT_LINE, $half_window] | max")
+}
+
+# @yaml
+# signature: open <path> [<line_number>]
+# docstring: opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line
+# arguments:
+# path:
+# type: string
+# description: the path to the file to open
+# required: true
+# line_number:
+# type: integer
+# description: the line number to move the window to (if not provided, the window will start at the top of the file)
+# required: false
+open() {
+ if [ -z "$1" ]
+ then
+ echo "Usage: open <file>"
+ return
+ fi
+ # Check if the second argument is provided
+ if [ -n "$2" ]; then
+ # Check if the provided argument is a valid number
+ if ! [[ $2 =~ ^[0-9]+$ ]]; then
+ echo "Usage: open <file> [<line_number>]"
+ echo "Error: <line_number> must be a number"
+ return # Exit if the line number is not valid
+ fi
+ local max_line=$(awk 'END {print NR}' $1)
+ if [ $2 -gt $max_line ]; then
+ echo "Warning: <line_number> ($2) is greater than the number of lines in the file ($max_line)"
+ echo "Warning: Setting <line_number> to $max_line"
+ local line_number=$(jq -n "$max_line") # Set line number to max if greater than max
+ elif [ $2 -lt 1 ]; then
+ echo "Warning: <line_number> ($2) is less than 1"
+ echo "Warning: Setting <line_number> to 1"
+ local line_number=$(jq -n "1") # Set line number to 1 if less than 1
+ else
+ local OFFSET=$(jq -n "$WINDOW/6" | jq 'floor')
+ local line_number=$(jq -n "[$2 + $WINDOW/2 - $OFFSET, 1] | max | floor")
+ fi
+ else
+ local line_number=$(jq -n "$WINDOW/2") # Set default line number if not provided
+ fi
+
+ if [ -f "$1" ]; then
+ export CURRENT_FILE=$(realpath $1)
+ export CURRENT_LINE=$line_number
+ _constrain_line
+ _print
+ elif [ -d "$1" ]; then
+ echo "Error: $1 is a directory. You can only open files. Use cd or ls to navigate directories."
+ else
+ echo "File $1 not found"
+ fi
+}
+
+# @yaml
+# signature: goto <line_number>
+# docstring: moves the window to show <line_number>
+# arguments:
+# line_number:
+# type: integer
+# description: the line number to move the window to
+# required: true
+goto() {
+ if [ $# -gt 1 ]; then
+ echo "goto allows only one line number at a time."
+ return
+ fi
+ if [ -z "$CURRENT_FILE" ]
+ then
+ echo "No file open. Use the open command first."
+ return
+ fi
+ if [ -z "$1" ]
+ then
+ echo "Usage: goto <line_number>"
+ return
+ fi
+ if ! [[ $1 =~ ^[0-9]+$ ]]
+ then
+ echo "Usage: goto <line_number>"
+ echo "Error: <line_number> must be a number"
+ return
+ fi
+ local max_line=$(awk 'END {print NR}' $CURRENT_FILE)
+ if [ $1 -gt $max_line ]
+ then
+ echo "Error: <line_number> must be less than or equal to $max_line"
+ return
+ fi
+ local OFFSET=$(jq -n "$WINDOW/6" | jq 'floor')
+ export CURRENT_LINE=$(jq -n "[$1 + $WINDOW/2 - $OFFSET, 1] | max | floor")
+ _constrain_line
+ _print
+}
+
+# @yaml
+# signature: scroll_down
+# docstring: moves the window down {WINDOW} lines
+scroll_down() {
+ if [ -z "$CURRENT_FILE" ]
+ then
+ echo "No file open. Use the open command first."
+ return
+ fi
+ export CURRENT_LINE=$(jq -n "$CURRENT_LINE + $WINDOW - $OVERLAP")
+ _constrain_line
+ _print
+}
+
+# @yaml
+# signature: scroll_up
+# docstring: moves the window up {WINDOW} lines
+scroll_up() {
+ if [ -z "$CURRENT_FILE" ]
+ then
+ echo "No file open. Use the open command first."
+ return
+ fi
+ export CURRENT_LINE=$(jq -n "$CURRENT_LINE - $WINDOW + $OVERLAP")
+ _constrain_line
+ _print
+}
+
+# @yaml
+# signature: create <filename>
+# docstring: creates and opens a new file with the given name
+# arguments:
+# filename:
+# type: string
+# description: the name of the file to create
+# required: true
+create() {
+ if [ -z "$1" ]; then
+ echo "Usage: create <filename>"
+ return
+ fi
+
+ # Check if the file already exists
+ if [ -e "$1" ]; then
+ echo "Error: File '$1' already exists."
+ open "$1"
+ return
+ fi
+
+ # Create the file with an empty new line
+ printf "\n" > "$1"
+ # Use the existing open command to open the created file
+ open "$1"
+}
+
+# @yaml
+# signature: submit
+# docstring: submits your current code and terminates the session
+submit() {
+ cd $ROOT
+
+ # Check if the patch file exists and is non-empty
+ if [ -s "/root/test.patch" ]; then
+ # Apply the patch in reverse
+ git apply -R < "/root/test.patch"
+ fi
+
+ git add -A
+ git diff --cached > model.patch
+ echo "<>"
+}
\ No newline at end of file
diff --git a/config/commands/edit_linting.sh b/config/commands/edit_linting.sh
new file mode 100644
index 000000000..7c4294bba
--- /dev/null
+++ b/config/commands/edit_linting.sh
@@ -0,0 +1,130 @@
+# @yaml
+# signature: |-
+#   edit <start_line>:<end_line>
+#   <replacement_text>
+# end_of_edit
+# docstring: replaces lines <start_line> through <end_line> (inclusive) with the given text in the open file. The replacement text is terminated by a line with only end_of_edit on it. All of the <replacement_text> will be entered, so make sure your indentation is formatted properly. Python files will be checked for syntax errors after the edit. If the system detects a syntax error, the edit will not be executed. Simply try to edit the file again, but make sure to read the error message and modify the edit command you issue accordingly. Issuing the same command a second time will just lead to the same error message again.
+# end_name: end_of_edit
+# arguments:
+# start_line:
+# type: integer
+# description: the line number to start the edit at
+# required: true
+# end_line:
+# type: integer
+# description: the line number to end the edit at (inclusive)
+# required: true
+# replacement_text:
+# type: string
+# description: the text to replace the current selection with
+# required: true
+edit() {
+ if [ -z "$CURRENT_FILE" ]
+ then
+ echo 'No file open. Use the `open` command first.'
+ return
+ fi
+
+ local start_line="$(echo $1: | cut -d: -f1)"
+ local end_line="$(echo $1: | cut -d: -f2)"
+
+ if [ -z "$start_line" ] || [ -z "$end_line" ]
+ then
+ echo "Usage: edit <start_line>:<end_line>"
+ return
+ fi
+
+ local re='^[0-9]+$'
+ if ! [[ $start_line =~ $re ]]; then
+ echo "Usage: edit <start_line>:<end_line>"
+ echo "Error: start_line must be a number"
+ return
+ fi
+ if ! [[ $end_line =~ $re ]]; then
+ echo "Usage: edit <start_line>:<end_line>"
+ echo "Error: end_line must be a number"
+ return
+ fi
+
+ # Bash array starts at 0, so let's adjust
+ local start_line=$((start_line - 1))
+ local end_line=$((end_line))
+
+ local line_count=0
+ local replacement=()
+ while IFS= read -r line
+ do
+ replacement+=("$line")
+ ((line_count++))
+ done
+
+ # Create a backup of the current file
+ cp "$CURRENT_FILE" "/root/$(basename "$CURRENT_FILE")_backup"
+
+ # Read the file line by line into an array
+ mapfile -t lines < "$CURRENT_FILE"
+ local new_lines=("${lines[@]:0:$start_line}" "${replacement[@]}" "${lines[@]:$((end_line))}")
+ # Write the new stuff directly back into the original file
+ printf "%s\n" "${new_lines[@]}" >| "$CURRENT_FILE"
+
+ # Run linter
+ if [[ $CURRENT_FILE == *.py ]]; then
+ lint_output=$(flake8 --select=F821,F822,F831,E111,E112,E113,E999,E902 "$CURRENT_FILE" 2>&1)
+ else
+ # do nothing
+ lint_output=""
+ fi
+
+ # if there is no output, then the file is good
+ if [ -z "$lint_output" ]; then
+ export CURRENT_LINE=$start_line
+ _constrain_line
+ _print
+
+ echo "File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary."
+ else
+ echo "Your proposed edit has introduced new syntax error(s). Please understand the fixes and retry your edit command."
+ echo ""
+ echo "ERRORS:"
+ _split_string "$lint_output"
+ echo ""
+
+ # Save original values
+ original_current_line=$CURRENT_LINE
+ original_window=$WINDOW
+
+ # Update values
+ export CURRENT_LINE=$(( (line_count / 2) + start_line )) # Set to "center" of edit
+ export WINDOW=$((line_count + 10)) # Show +/- 5 lines around edit
+
+ echo "This is how your edit would have looked if applied"
+ echo "-------------------------------------------------"
+ _constrain_line
+ _print
+ echo "-------------------------------------------------"
+ echo ""
+
+ # Restoring CURRENT_FILE to original contents.
+ cp "/root/$(basename "$CURRENT_FILE")_backup" "$CURRENT_FILE"
+
+ export CURRENT_LINE=$(( ((end_line - start_line + 1) / 2) + start_line ))
+ export WINDOW=$((end_line - start_line + 10))
+
+ echo "This is the original code before your edit"
+ echo "-------------------------------------------------"
+ _constrain_line
+ _print
+ echo "-------------------------------------------------"
+
+ # Restore original values
+ export CURRENT_LINE=$original_current_line
+ export WINDOW=$original_window
+
+ echo "Your changes have NOT been applied. Please fix your edit command and try again."
+ echo "You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code."
+ echo "DO NOT re-run the same failed edit command. Running it again will lead to the same error."
+ fi
+
+ # Remove backup file
+ rm -f "/root/$(basename "$CURRENT_FILE")_backup"
+}
diff --git a/config/commands/search.sh b/config/commands/search.sh
new file mode 100644
index 000000000..8db25c2b1
--- /dev/null
+++ b/config/commands/search.sh
@@ -0,0 +1,155 @@
+# @yaml
+# signature: search_dir <search_term> [<dir>]
+# docstring: searches for search_term in all files in dir. If dir is not provided, searches in the current directory
+# arguments:
+# search_term:
+# type: string
+# description: the term to search for
+# required: true
+# dir:
+# type: string
+# description: the directory to search in (if not provided, searches in the current directory)
+# required: false
+search_dir() {
+ if [ $# -eq 1 ]; then
+ local search_term="$1"
+ local dir="./"
+ elif [ $# -eq 2 ]; then
+ local search_term="$1"
+ if [ -d "$2" ]; then
+ local dir="$2"
+ else
+ echo "Directory $2 not found"
+ return
+ fi
+ else
+ echo "Usage: search_dir <search_term> [<dir>]"
+ return
+ fi
+ dir=$(realpath "$dir")
+ local matches=$(find "$dir" -type f ! -path '*/.*' -exec grep -nIH "$search_term" {} + | cut -d: -f1 | sort | uniq -c)
+ # if no matches, return
+ if [ -z "$matches" ]; then
+ echo "No matches found for \"$search_term\" in $dir"
+ return
+ fi
+ # Calculate total number of matches
+ local num_matches=$(echo "$matches" | awk '{sum+=$1} END {print sum}')
+ # calculate total number of files matched
+ local num_files=$(echo "$matches" | wc -l | awk '{$1=$1; print $0}')
+ # if num_files is > 100, print an error
+ if [ $num_files -gt 100 ]; then
+ echo "More than $num_files files matched for \"$search_term\" in $dir. Please narrow your search."
+ return
+ fi
+
+ echo "Found $num_matches matches for \"$search_term\" in $dir:"
+ echo "$matches" | awk '{$2=$2; gsub(/^\.+\/+/, "./", $2); print $2 " ("$1" matches)"}'
+ echo "End of matches for \"$search_term\" in $dir"
+}
+
+# @yaml
+# signature: search_file <search_term> [<file>]
+# docstring: searches for search_term in file. If file is not provided, searches in the current open file
+# arguments:
+# search_term:
+# type: string
+# description: the term to search for
+# required: true
+# file:
+# type: string
+# description: the file to search in (if not provided, searches in the current open file)
+# required: false
+search_file() {
+ # Check if the first argument is provided
+ if [ -z "$1" ]; then
+ echo "Usage: search_file <search_term> [<file>]"
+ return
+ fi
+ # Check if the second argument is provided
+ if [ -n "$2" ]; then
+ # Check if the provided argument is a valid file
+ if [ -f "$2" ]; then
+ local file="$2" # Set file if valid
+ else
+ echo "Usage: search_file <search_term> [<file>]"
+ echo "Error: File name $2 not found. Please provide a valid file name."
+ return # Exit if the file is not valid
+ fi
+ else
+ # Check if a file is open
+ if [ -z "$CURRENT_FILE" ]; then
+ echo "No file open. Use the open command first."
+ return # Exit if no file is open
+ fi
+ local file="$CURRENT_FILE" # Set file to the current open file
+ fi
+ local search_term="$1"
+ file=$(realpath "$file")
+ # Use grep to directly get the desired formatted output
+ local matches=$(grep -nH "$search_term" "$file")
+ # Check if no matches were found
+ if [ -z "$matches" ]; then
+ echo "No matches found for \"$search_term\" in $file"
+ return
+ fi
+ # Calculate total number of matches
+ local num_matches=$(echo "$matches" | wc -l | awk '{$1=$1; print $0}')
+
+ # calculate total number of lines matched
+ local num_lines=$(echo "$matches" | cut -d: -f1 | sort | uniq | wc -l | awk '{$1=$1; print $0}')
+ # if num_lines is > 100, print an error
+ if [ $num_lines -gt 100 ]; then
+ echo "More than $num_lines lines matched for \"$search_term\" in $file. Please narrow your search."
+ return
+ fi
+
+ # Print the total number of matches and the matches themselves
+ echo "Found $num_matches matches for \"$search_term\" in $file:"
+ echo "$matches" | cut -d: -f1-2 | sort -u -t: -k2,2n | while IFS=: read -r filename line_number; do
+ echo "Line $line_number:$(sed -n "${line_number}p" "$file")"
+ done
+ echo "End of matches for \"$search_term\" in $file"
+}
+
+# @yaml
+# signature: find_file <file_name> [<dir>]
+# docstring: finds all files with the given name in dir. If dir is not provided, searches in the current directory
+# arguments:
+# file_name:
+# type: string
+# description: the name of the file to search for
+# required: true
+# dir:
+# type: string
+# description: the directory to search in (if not provided, searches in the current directory)
+# required: false
+find_file() {
+ if [ $# -eq 1 ]; then
+ local file_name="$1"
+ local dir="./"
+ elif [ $# -eq 2 ]; then
+ local file_name="$1"
+ if [ -d "$2" ]; then
+ local dir="$2"
+ else
+ echo "Directory $2 not found"
+ return
+ fi
+ else
+ echo "Usage: find_file <file_name> [<dir>]"
+ return
+ fi
+
+ dir=$(realpath "$dir")
+ local matches=$(find "$dir" -type f -name "$file_name")
+ # if no matches, return
+ if [ -z "$matches" ]; then
+ echo "No matches found for \"$file_name\" in $dir"
+ return
+ fi
+ # Calculate total number of matches
+ local num_matches=$(echo "$matches" | wc -l | awk '{$1=$1; print $0}')
+ echo "Found $num_matches matches for \"$file_name\" in $dir:"
+ echo "$matches" | awk '{print $0}'
+}
\ No newline at end of file
diff --git a/config/configs/default_sys-env_cursors_window100-detailed_cmd_format-full_history-1_demos.yaml b/config/configs/default_sys-env_cursors_window100-detailed_cmd_format-full_history-1_demos.yaml
new file mode 100644
index 000000000..80b55eecd
--- /dev/null
+++ b/config/configs/default_sys-env_cursors_window100-detailed_cmd_format-full_history-1_demos.yaml
@@ -0,0 +1,116 @@
+system_template: |-
+ SETTING: You are an autonomous programmer, and you're working directly in the command line with a special interface.
+
+ The special interface consists of a file editor that shows you {WINDOW} lines of a file at a time.
+ In addition to typical bash commands, you can also use the following commands to help you navigate and edit files.
+
+ COMMANDS:
+ {command_docs}
+
+ Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION.
+ If you'd like to add the line ' print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+ RESPONSE FORMAT:
+ Your shell prompt is formatted as follows:
+ (Open file: <path>) <cwd> $
+
+ You need to format your output using two fields; discussion and command.
+ Your output should always include _one_ discussion and _one_ command field EXACTLY as in the following example:
+ DISCUSSION
+ First I'll start by using ls to see what files are in the current directory. Then maybe we can look at some relevant files to see what they look like.
+ ```
+ ls -a
+ ```
+
+ You should only include a *SINGLE* command in the command section and then wait for a response from the shell before continuing with more discussion and commands. Everything you include in the DISCUSSION section will be saved for future reference.
+ If you'd like to issue two commands at once, PLEASE DO NOT DO THAT! Please instead first submit just the first command, and then after receiving a response you'll be able to issue the second command.
+ You're free to use any other bash commands you want (e.g. find, grep, cat, ls, cd) in addition to the special commands listed above.
+ However, the environment does NOT support interactive session commands (e.g. python, vim), so please do not invoke them.
+instance_template: |-
+ We're currently solving the following issue within our repository. Here's the issue text:
+ ISSUE:
+ {issue}
+
+ INSTRUCTIONS:
+ Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
+ Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for feedback after every command.
+ When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
+ Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
+
+ NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
+
+ IMPORTANT TIPS:
+ 1. Always start by trying to replicate the bug that the issue discusses.
+ If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
+ Then start trying to fix it.
+ When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
+
+ If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
+ so that you can be sure that the script indeed ran fine all the way through.
+
+ 2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
+
+ 3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
+
+ 4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code, to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the linux 'find' command.
+
+ 5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
+
+ 6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
+
+
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+next_step_template: |-
+ {observation}
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+next_step_no_output_template: |-
+ Your command ran successfully and did not produce any output.
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+demonstration_template: |
+ Here is a demonstration of how to correctly accomplish this task.
+ It is included to show you how to correctly use the interface.
+ You do not need to follow exactly what is done in the demonstration.
+ --- DEMONSTRATION ---
+ {demonstration}
+ --- END OF DEMONSTRATION ---
+state_command:
+ name: state
+ code: |
+ state() {
+ local working_dir="$PWD";
+ if [ -z $CURRENT_FILE ]; then
+ echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}';
+ else
+ echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}';
+ fi
+ };
+parse_function: ThoughtActionParser
+env_variables:
+ WINDOW: 200
+ OVERLAP: 2
+ CURRENT_LINE: 0
+ CURRENT_FILE: ''
+ SEARCH_RESULTS: ()
+ SEARCH_FILES: ()
+ SEARCH_INDEX: 0
+ START_INDEX: 0
+ END_INDEX: 0
+ START_CURSOR: 0
+ END_CURSOR: 0
+ START_CURSOR_MARK: '"<<<<< START CURSOR >>>>>"' # these have to use double quotes
+ END_CURSOR_MARK: '"<<<<< END CURSOR >>>>>"' # these have to use double quotes
+command_files:
+- config/commands/cursors_defaults.sh
+- config/commands/cursors_edit_linting.sh
+- config/commands/search.sh
+- config/commands/_split_string.py
+parse_command: ParseCommandDetailed
+history_processor: DefaultHistoryProcessor
+demonstrations:
+- trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_cursors_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj
diff --git a/config/configs/default_sys-env_window100-detailed_cmd_format-full_history-1_demos.yaml b/config/configs/default_sys-env_window100-detailed_cmd_format-full_history-1_demos.yaml
new file mode 100644
index 000000000..14b691814
--- /dev/null
+++ b/config/configs/default_sys-env_window100-detailed_cmd_format-full_history-1_demos.yaml
@@ -0,0 +1,110 @@
+system_template: |-
+ SETTING: You are an autonomous programmer, and you're working directly in the command line with a special interface.
+
+ The special interface consists of a file editor that shows you {WINDOW} lines of a file at a time.
+ In addition to typical bash commands, you can also use the following commands to help you navigate and edit files.
+
+ COMMANDS:
+ {command_docs}
+
+ Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION.
+ If you'd like to add the line ' print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+ RESPONSE FORMAT:
+ Your shell prompt is formatted as follows:
+ (Open file: <path>) <cwd> $
+
+ You need to format your output using two fields; discussion and command.
+ Your output should always include _one_ discussion and _one_ command field EXACTLY as in the following example:
+ DISCUSSION
+ First I'll start by using ls to see what files are in the current directory. Then maybe we can look at some relevant files to see what they look like.
+ ```
+ ls -a
+ ```
+
+ You should only include a *SINGLE* command in the command section and then wait for a response from the shell before continuing with more discussion and commands. Everything you include in the DISCUSSION section will be saved for future reference.
+ If you'd like to issue two commands at once, PLEASE DO NOT DO THAT! Please instead first submit just the first command, and then after receiving a response you'll be able to issue the second command.
+ You're free to use any other bash commands you want (e.g. find, grep, cat, ls, cd) in addition to the special commands listed above.
+ However, the environment does NOT support interactive session commands (e.g. python, vim), so please do not invoke them.
+instance_template: |-
+ We're currently solving the following issue within our repository. Here's the issue text:
+ ISSUE:
+ {issue}
+
+ INSTRUCTIONS:
+ Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
+ Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for feedback after every command.
+ When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
+ Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
+
+ NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
+
+ IMPORTANT TIPS:
+ 1. Always start by trying to replicate the bug that the issue discusses.
+ If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
+ Then start trying to fix it.
+ When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
+
+ If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
+ so that you can be sure that the script indeed ran fine all the way through.
+
+ 2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
+
+ 3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
+
+ 4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code, to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the linux 'find' command.
+
+ 5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
+
+ 6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
+
+
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+next_step_template: |-
+ {observation}
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+next_step_no_output_template: |-
+ Your command ran successfully and did not produce any output.
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+demonstration_template: |
+ Here is a demonstration of how to correctly accomplish this task.
+ It is included to show you how to correctly use the interface.
+ You do not need to follow exactly what is done in the demonstration.
+ --- DEMONSTRATION ---
+ {demonstration}
+ --- END OF DEMONSTRATION ---
+state_command:
+ name: state
+ code: |
+ state() {
+ local working_dir="$PWD";
+ if [ -z $CURRENT_FILE ]; then
+ echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}';
+ else
+ echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}';
+ fi
+ };
+parse_function: ThoughtActionParser
+env_variables:
+ WINDOW: 100
+ OVERLAP: 2
+ CURRENT_LINE: 0
+ CURRENT_FILE: ''
+ SEARCH_RESULTS: ()
+ SEARCH_FILES: ()
+ SEARCH_INDEX: 0
+command_files:
+- config/commands/defaults.sh
+- config/commands/search.sh
+- config/commands/edit_linting.sh
+- config/commands/_split_string.py
+parse_command: ParseCommandDetailed
+history_processor: DefaultHistoryProcessor
+demonstrations:
+- trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj
diff --git a/config/configs/xml_sys-env_cursors_window100-detailed_cmd_format-full_history-1_demos.yaml b/config/configs/xml_sys-env_cursors_window100-detailed_cmd_format-full_history-1_demos.yaml
new file mode 100644
index 000000000..e68761878
--- /dev/null
+++ b/config/configs/xml_sys-env_cursors_window100-detailed_cmd_format-full_history-1_demos.yaml
@@ -0,0 +1,110 @@
+system_template: |-
+ SETTING: You are an autonomous programmer, and you're working directly in the command line with a special interface.
+
+ The special interface consists of a file editor that shows you {WINDOW} lines of a file at a time.
+ In addition to typical bash commands, you can also use the following commands to help you navigate and edit files.
+
+ COMMANDS:
+ {command_docs}
+
+ Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION.
+ If you'd like to add the line ' print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+ RESPONSE FORMAT:
+ Your shell prompt is formatted as follows:
+ (Open file: <path>) <cwd> $
+
+ You need to format your output using two fields: discussion and command.
+ Your output should always include _one_ discussion and _one_ command field EXACTLY as in the following example:
+ DISCUSSION
+ First I'll start by using ls to see what files are in the current directory. Then maybe we can look at some relevant files to see what they look like.
+ <command>
+ ls -a
+ </command>
+
+ You should only include a *SINGLE* command in the command section and then wait for a response from the shell before continuing with more discussion and commands. Everything you include in the DISCUSSION section will be saved for future reference.
+ If you'd like to issue two commands at once, PLEASE DO NOT DO THAT! Please instead first submit just the first command, and then after receiving a response you'll be able to issue the second command.
+ You're free to use any other bash commands you want (e.g. find, grep, cat, ls, cd) in addition to the special commands listed above.
+ However, the environment does NOT support interactive session commands (e.g. python, vim), so please do not invoke them.
+instance_template: |-
+ We're currently solving the following issue within our repository. Here's the issue text:
+ ISSUE:
+ {issue}
+
+ INSTRUCTIONS:
+ Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
+ Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for feedback after every command.
+ When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
+ Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
+
+ NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
+
+ IMPORTANT TIPS:
+ 1. Always start by trying to replicate the bug that the issue discusses.
+ If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
+ Then start trying to fix it.
+ When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
+
+ If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
+ so that you can be sure that the script indeed ran fine all the way through.
+
+ 2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
+
+ 3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
+
+ 4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the Linux 'find' command.
+
+ 5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
+
+ 6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
+
+
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+next_step_template: |-
+ {observation}
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+next_step_no_output_template: |-
+ Your command ran successfully and did not produce any output.
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+put_demos_in_history: true
+state_command:
+ name: state
+ code: |
+ state() {
+ local working_dir="$PWD";
+ if [ -z $CURRENT_FILE ]; then
+ echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}';
+ else
+ echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}';
+ fi
+ };
+parse_function: XMLThoughtActionParser
+env_variables:
+ WINDOW: 200
+ OVERLAP: 2
+ CURRENT_LINE: 0
+ CURRENT_FILE: ''
+ SEARCH_RESULTS: ()
+ SEARCH_FILES: ()
+ SEARCH_INDEX: 0
+ START_INDEX: 0
+ END_INDEX: 0
+ START_CURSOR: 0
+ END_CURSOR: 0
+ START_CURSOR_MARK: '"<<<<< START CURSOR >>>>>"' # these have to use double quotes
+ END_CURSOR_MARK: '"<<<<< END CURSOR >>>>>"' # these have to use double quotes
+command_files:
+- config/commands/cursors_defaults.sh
+- config/commands/cursors_edit_linting.sh
+- config/commands/search.sh
+- config/commands/_split_string.py
+parse_command: ParseCommandDetailed
+history_processor: DefaultHistoryProcessor
+demonstrations:
+- trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__xml_sys-env_cursors_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj
diff --git a/config/configs/xml_sys-env_window100-detailed_cmd_format-full_history-1_demos.yaml b/config/configs/xml_sys-env_window100-detailed_cmd_format-full_history-1_demos.yaml
new file mode 100644
index 000000000..00cb15c9f
--- /dev/null
+++ b/config/configs/xml_sys-env_window100-detailed_cmd_format-full_history-1_demos.yaml
@@ -0,0 +1,104 @@
+system_template: |-
+ SETTING: You are an autonomous programmer, and you're working directly in the command line with a special interface.
+
+ The special interface consists of a file editor that shows you {WINDOW} lines of a file at a time.
+ In addition to typical bash commands, you can also use the following commands to help you navigate and edit files.
+
+ COMMANDS:
+ {command_docs}
+
+ Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION.
+ If you'd like to add the line ' print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+ RESPONSE FORMAT:
+ Your shell prompt is formatted as follows:
+ (Open file: <path>) <cwd> $
+
+ You need to format your output using two fields: discussion and command.
+ Your output should always include _one_ discussion and _one_ command field EXACTLY as in the following example:
+ DISCUSSION
+ First I'll start by using ls to see what files are in the current directory. Then maybe we can look at some relevant files to see what they look like.
+ <command>
+ ls -a
+ </command>
+
+ You should only include a *SINGLE* command in the command section and then wait for a response from the shell before continuing with more discussion and commands. Everything you include in the DISCUSSION section will be saved for future reference.
+ If you'd like to issue two commands at once, PLEASE DO NOT DO THAT! Please instead first submit just the first command, and then after receiving a response you'll be able to issue the second command.
+ You're free to use any other bash commands you want (e.g. find, grep, cat, ls, cd) in addition to the special commands listed above.
+ However, the environment does NOT support interactive session commands (e.g. python, vim), so please do not invoke them.
+instance_template: |-
+ We're currently solving the following issue within our repository. Here's the issue text:
+ ISSUE:
+ {issue}
+
+ INSTRUCTIONS:
+ Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
+ Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for feedback after every command.
+ When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
+ Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
+
+ NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
+
+ IMPORTANT TIPS:
+ 1. Always start by trying to replicate the bug that the issue discusses.
+ If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
+ Then start trying to fix it.
+ When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
+
+ If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
+ so that you can be sure that the script indeed ran fine all the way through.
+
+ 2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
+
+ 3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
+
+ 4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the Linux 'find' command.
+
+ 5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
+
+ 6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
+
+
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+next_step_template: |-
+ {observation}
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+next_step_no_output_template: |-
+ Your command ran successfully and did not produce any output.
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+put_demos_in_history: true
+state_command:
+ name: state
+ code: |
+ state() {
+ local working_dir="$PWD";
+ if [ -z $CURRENT_FILE ]; then
+ echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}';
+ else
+ echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}';
+ fi
+ };
+parse_function: XMLThoughtActionParser
+env_variables:
+ WINDOW: 100
+ OVERLAP: 2
+ CURRENT_LINE: 0
+ CURRENT_FILE: ''
+ SEARCH_RESULTS: ()
+ SEARCH_FILES: ()
+ SEARCH_INDEX: 0
+command_files:
+- config/commands/defaults.sh
+- config/commands/search.sh
+- config/commands/edit_linting.sh
+- config/commands/_split_string.py
+parse_command: ParseCommandDetailed
+history_processor: DefaultHistoryProcessor
+demonstrations:
+- trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__xml_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj
diff --git a/config/default.yaml b/config/default.yaml
new file mode 100644
index 000000000..14b691814
--- /dev/null
+++ b/config/default.yaml
@@ -0,0 +1,110 @@
+system_template: |-
+ SETTING: You are an autonomous programmer, and you're working directly in the command line with a special interface.
+
+ The special interface consists of a file editor that shows you {WINDOW} lines of a file at a time.
+ In addition to typical bash commands, you can also use the following commands to help you navigate and edit files.
+
+ COMMANDS:
+ {command_docs}
+
+ Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION.
+ If you'd like to add the line ' print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+ RESPONSE FORMAT:
+ Your shell prompt is formatted as follows:
+ (Open file: <path>) <cwd> $
+
+ You need to format your output using two fields: discussion and command.
+ Your output should always include _one_ discussion and _one_ command field EXACTLY as in the following example:
+ DISCUSSION
+ First I'll start by using ls to see what files are in the current directory. Then maybe we can look at some relevant files to see what they look like.
+ ```
+ ls -a
+ ```
+
+ You should only include a *SINGLE* command in the command section and then wait for a response from the shell before continuing with more discussion and commands. Everything you include in the DISCUSSION section will be saved for future reference.
+ If you'd like to issue two commands at once, PLEASE DO NOT DO THAT! Please instead first submit just the first command, and then after receiving a response you'll be able to issue the second command.
+ You're free to use any other bash commands you want (e.g. find, grep, cat, ls, cd) in addition to the special commands listed above.
+ However, the environment does NOT support interactive session commands (e.g. python, vim), so please do not invoke them.
+instance_template: |-
+ We're currently solving the following issue within our repository. Here's the issue text:
+ ISSUE:
+ {issue}
+
+ INSTRUCTIONS:
+ Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
+ Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for feedback after every command.
+ When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
+ Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
+
+ NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
+
+ IMPORTANT TIPS:
+ 1. Always start by trying to replicate the bug that the issue discusses.
+ If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
+ Then start trying to fix it.
+ When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
+
+ If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
+ so that you can be sure that the script indeed ran fine all the way through.
+
+ 2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
+
+ 3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
+
+ 4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the Linux 'find' command.
+
+ 5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
+
+ 6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
+
+
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+next_step_template: |-
+ {observation}
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+next_step_no_output_template: |-
+ Your command ran successfully and did not produce any output.
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+demonstration_template: |
+ Here is a demonstration of how to correctly accomplish this task.
+ It is included to show you how to correctly use the interface.
+ You do not need to follow exactly what is done in the demonstration.
+ --- DEMONSTRATION ---
+ {demonstration}
+ --- END OF DEMONSTRATION ---
+state_command:
+ name: state
+ code: |
+ state() {
+ local working_dir="$PWD";
+ if [ -z $CURRENT_FILE ]; then
+ echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}';
+ else
+ echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}';
+ fi
+ };
+parse_function: ThoughtActionParser
+env_variables:
+ WINDOW: 100
+ OVERLAP: 2
+ CURRENT_LINE: 0
+ CURRENT_FILE: ''
+ SEARCH_RESULTS: ()
+ SEARCH_FILES: ()
+ SEARCH_INDEX: 0
+command_files:
+- config/commands/defaults.sh
+- config/commands/search.sh
+- config/commands/edit_linting.sh
+- config/commands/_split_string.py
+parse_command: ParseCommandDetailed
+history_processor: DefaultHistoryProcessor
+demonstrations:
+- trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj
diff --git a/config/default_from_url.yaml b/config/default_from_url.yaml
new file mode 100644
index 000000000..30b570b8a
--- /dev/null
+++ b/config/default_from_url.yaml
@@ -0,0 +1,112 @@
+system_template: |-
+ SETTING: You are an autonomous programmer, and you're working directly in the command line with a special interface.
+
+ The special interface consists of a file editor that shows you {WINDOW} lines of a file at a time.
+ In addition to typical bash commands, you can also use the following commands to help you navigate and edit files.
+
+ COMMANDS:
+ {command_docs}
+
+ Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION.
+ If you'd like to add the line ' print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+ RESPONSE FORMAT:
+ Your shell prompt is formatted as follows:
+ (Open file: <path>) <cwd> $
+
+ You need to format your output using two fields: discussion and command.
+ Your output should always include _one_ discussion and _one_ command field EXACTLY as in the following example:
+ DISCUSSION
+ First I'll start by using ls to see what files are in the current directory. Then maybe we can look at some relevant files to see what they look like.
+ ```
+ ls -a
+ ```
+
+ You should only include a *SINGLE* command in the command section and then wait for a response from the shell before continuing with more discussion and commands. Everything you include in the DISCUSSION section will be saved for future reference.
+ If you'd like to issue two commands at once, PLEASE DO NOT DO THAT! Please instead first submit just the first command, and then after receiving a response you'll be able to issue the second command.
+ You're free to use any other bash commands you want (e.g. find, grep, cat, ls, cd) in addition to the special commands listed above.
+ However, the environment does NOT support interactive session commands (e.g. python, vim), so please do not invoke them.
+instance_template: |-
+ We're currently solving the following issue within our repository. Here's the issue text:
+ ISSUE:
+ {issue}
+
+ INSTRUCTIONS:
+ Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
+ Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for feedback after every command.
+ When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
+ Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
+
+ NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
+
+ IMPORTANT TIPS:
+ 1. Always start by trying to replicate the bug that the issue discusses.
+ If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
+ Then start trying to fix it.
+ When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
+
+ If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
+ so that you can be sure that the script indeed ran fine all the way through.
+
+ 2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
+
+ 3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
+
+ 4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the Linux 'find' command.
+
+ 5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
+
+ 6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
+
+ 7. It may be necessary to install the repository from source before you can run code. Please think about how to install the environment from the repository directory if you need to do so.
+
+
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+next_step_template: |-
+ {observation}
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+next_step_no_output_template: |-
+ Your command ran successfully and did not produce any output.
+ (Open file: {open_file})
+ (Current directory: {working_dir})
+ bash-$
+demonstration_template: |
+ Here is a demonstration of how to correctly accomplish this task.
+ It is included to show you how to correctly use the interface.
+ You do not need to follow exactly what is done in the demonstration.
+ --- DEMONSTRATION ---
+ {demonstration}
+ --- END OF DEMONSTRATION ---
+state_command:
+ name: state
+ code: |
+ state() {
+ local working_dir="$PWD";
+ if [ -z $CURRENT_FILE ]; then
+ echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}';
+ else
+ echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}';
+ fi
+ };
+parse_function: ThoughtActionParser
+env_variables:
+ WINDOW: 100
+ OVERLAP: 2
+ CURRENT_LINE: 0
+ CURRENT_FILE: ''
+ SEARCH_RESULTS: ()
+ SEARCH_FILES: ()
+ SEARCH_INDEX: 0
+command_files:
+- config/commands/defaults.sh
+- config/commands/search.sh
+- config/commands/edit_linting.sh
+- config/commands/_split_string.py
+parse_command: ParseCommandDetailed
+history_processor: DefaultHistoryProcessor
+demonstrations:
+- trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default__t-0.20__p-0.95__c-2.00__install-1___install_from_source/marshmallow-code__marshmallow-1867.traj
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 000000000..c4773a76c
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,70 @@
+# Docker
+To ensure reproducibility and sandboxed execution of SWE-agent actions across systems, we adopt practices established in [prior work](https://intercode-benchmark.github.io/) and use [🐳 Docker](https://www.docker.com/) containers to carry out SWE-agent inference.
+
+* The `swe.Dockerfile` file defines the customized image used as SWE-agent's execution environment.
+* The `./setup.sh` script automatically builds this image (a roughly equivalent manual build command is sketched below).
+* When `run.py` is invoked, containers are automatically created from the built image.
+ * There is no need to manually build a container from the image.
+
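+For reference, the build that `./setup.sh` performs is roughly equivalent to the following command, run from the repository root (the Miniconda URL is only an example; use the installer matching your platform):
+
+```bash
+docker build -t swe-agent \
+    --build-arg MINICONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
+    -f docker/swe.Dockerfile .
+```
+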
+Here, we explain what each line in `swe.Dockerfile` does:
+
+1. **Base Image**: Start from the Ubuntu 22.04 (`jammy`) base image.
+```bash
+FROM ubuntu:jammy
+```
+2. **Build Argument**: Define a build argument `MINICONDA_URL` that will be used to specify the Miniconda installer URL during the build process.
+```bash
+ARG MINICONDA_URL
+```
+3. **Install Third-Party Tools**: Update the package lists for the Ubuntu package manager and install several essential development tools. Clean up after the installation.
+```bash
+RUN apt-get update && \
+ apt-get install -y bash gcc git jq wget g++ make && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+```
+4. **Initialize Git**: Configure global Git settings with a user email and name.
+```bash
+RUN git config --global user.email "sweagent@pnlp.org"
+RUN git config --global user.name "sweagent"
+```
+5. **Environment Variables**: Set the `ROOT` environment variable and customize the shell prompt.
+```bash
+ENV ROOT='/dev/'
+RUN prompt() { echo " > "; };
+ENV PS1="> "
+```
+6. **Create Assets for Inference**: Create two files that are used to track metadata during an episode.
+```bash
+RUN touch /root/files_to_edit.txt
+RUN touch /root/test.patch
+```
+7. **Enhance `ls` Command**: Modify the `.bashrc` file to alias the `ls` command.
+```bash
+RUN echo "alias ls='ls -F'" >> /root/.bashrc
+```
+8. **Install Miniconda**: Download and install Miniconda, then initialize conda with Bash support and add `conda-forge` to the channels list.
+```bash
+ENV PATH="/root/miniconda3/bin:${PATH}"
+ARG PATH="/root/miniconda3/bin:${PATH}"
+RUN wget ${MINICONDA_URL} -O miniconda.sh \
+ && mkdir /root/.conda \
+ && bash miniconda.sh -b \
+ && rm -f miniconda.sh
+RUN conda --version \
+ && conda init bash \
+ && conda config --append channels conda-forge
+```
+9. **Install Python Packages**: Copy the `requirements.txt` file into the image and install the specified Python packages.
+```bash
+COPY docker/requirements.txt /root/requirements.txt
+RUN pip install -r /root/requirements.txt
+```
+10. **Set Working Directory**: Set the working directory to the root directory.
+```bash
+WORKDIR /
+```
+11. **Default Command**: Set the default command to open a Bash shell when the container starts.
+```bash
+CMD ["/bin/bash"]
+```
\ No newline at end of file
diff --git a/docker/eval.Dockerfile b/docker/eval.Dockerfile
new file mode 100644
index 000000000..50a0c55b0
--- /dev/null
+++ b/docker/eval.Dockerfile
@@ -0,0 +1,6 @@
+FROM swe-agent
+
+# Assumes the image is built with the repository root as the build context
+COPY evaluation/evaluation.py /evaluation.py
+RUN pip install git+https://github.com/princeton-nlp/SWE-bench.git
+RUN pip install unidiff
+CMD ["python", "/evaluation.py"]
diff --git a/docker/requirements.txt b/docker/requirements.txt
new file mode 100644
index 000000000..04cf09e82
--- /dev/null
+++ b/docker/requirements.txt
@@ -0,0 +1,3 @@
+anthropic
+config
+openai
\ No newline at end of file
diff --git a/docker/swe.Dockerfile b/docker/swe.Dockerfile
new file mode 100644
index 000000000..ecf5be09c
--- /dev/null
+++ b/docker/swe.Dockerfile
@@ -0,0 +1,44 @@
+FROM ubuntu:jammy
+
+ARG MINICONDA_URL
+
+# Install third party tools
+RUN apt-get update && \
+ apt-get install -y bash gcc git jq wget g++ make && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+# Initialize git
+RUN git config --global user.email "sweagent@pnlp.org"
+RUN git config --global user.name "sweagent"
+
+# Environment variables
+ENV ROOT='/dev/'
+RUN prompt() { echo " > "; };
+ENV PS1="> "
+
+# Create file for tracking edits, test patch
+RUN touch /root/files_to_edit.txt
+RUN touch /root/test.patch
+
+# add ls file indicator
+RUN echo "alias ls='ls -F'" >> /root/.bashrc
+
+# Install miniconda
+ENV PATH="/root/miniconda3/bin:${PATH}"
+ARG PATH="/root/miniconda3/bin:${PATH}"
+RUN wget ${MINICONDA_URL} -O miniconda.sh \
+ && mkdir /root/.conda \
+ && bash miniconda.sh -b \
+ && rm -f miniconda.sh
+RUN conda --version \
+ && conda init bash \
+ && conda config --append channels conda-forge
+
+# Install python packages
+COPY docker/requirements.txt /root/requirements.txt
+RUN pip install -r /root/requirements.txt
+
+WORKDIR /
+
+CMD ["/bin/bash"]
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 000000000..87d980201
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,21 @@
+name: swe-agent
+dependencies:
+ - python=3.9
+ - pip
+ - pip:
+ - anthropic
+ - config
+ - datasets
+ - docker
+ - gnureadline
+ - gymnasium
+ - numpy
+ - openai>=1.0
+ - pandas
+ - rich
+ - ruamel.yaml
+ - swebench>=1.0.1
+ - tenacity
+ - unidiff
+ - simple-parsing
+ - together
diff --git a/evaluation/README.md b/evaluation/README.md
new file mode 100644
index 000000000..4b584fca8
--- /dev/null
+++ b/evaluation/README.md
@@ -0,0 +1,79 @@
+# Evaluation
+
+The `evaluation/` folder provides SWE-agent compatible scripts for running [SWE-bench style evaluation](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md) on model patch predictions. It also includes scripts to quantify model performance on "subtasks" within the SWE-bench task, such as identifying the right file(s) to edit.
+
+## 📖 Table of Contents
+- [Evaluation](#evaluation)
+  - [📖 Table of Contents](#-table-of-contents)
+  - [🚀 Quick Start ](#-quick-start-)
+  - [🧪 SWE-bench Evaluation ](#-swe-bench-evaluation-)
+  - [📈 Viewing Results ](#-viewing-results-)
+
+## 🚀 Quick Start
+You can run evaluations on SWE-bench by passing in the predictions generated by SWE-agent (usually named `all_preds.jsonl`). Simply run the following script with the path to your predictions file:
+
+```bash
+./run_eval.sh <path_to_all_preds.jsonl>
+```
+
+Depending on the number of task instances and how long setting up the execution environment takes, the evaluation can take anywhere from a couple of minutes to around 7 hours for the entire SWE-bench test split.
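+
+If you want to override the defaults, the remaining positional arguments (dataset, results directory, testbed directory) can be passed explicitly; the values below are simply the script's own defaults, shown for illustration:
+```bash
+./run_eval.sh <path_to_all_preds.jsonl> princeton-nlp/SWE-bench results testbed
+```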
+
+When evaluation finishes, you should see an output similar to the following:
+```bash
+2024-03-31 16:47:00,263 - taskenv_context_manager - INFO - [pvlib__pvlib-python__0.8] [pvlib__pvlib-python-1395] Installing with command: . /n/fs/p-swe-bench/testbed/ba397fe0d6/pvlib__pvlib-python/0.8/tmpom22t9na/miniconda3/bin/activate pvlib__pvlib-python__0.8 && echo 'activate successful' && pip install -e .[all]
+2024-03-31 16:47:10,602 - taskenv_context_manager - INFO - [pvlib__pvlib-python__0.8] [pvlib__pvlib-python-1395] Installation successful
+2024-03-31 16:47:10,619 - taskenv_context_manager - INFO - [pvlib__pvlib-python__0.8] [pvlib__pvlib-python-1395] Apply patch successful (test)
+2024-03-31 16:47:10,635 - taskenv_context_manager - INFO - [pvlib__pvlib-python__0.8] [pvlib__pvlib-python-1395] Apply patch successful (pred)
+2024-03-31 16:47:13,453 - taskenv_context_manager - INFO - [pvlib__pvlib-python__0.8] [pvlib__pvlib-python-1395] Test script run successful
+==================================
+Log directory for evaluation run: /n/fs/p-swe-bench/results/gpt-4-1106-preview__swe-bench-dev-40-seed24__default_sys-env_window100-detailed_cmd_format-full_history-1_demos__t-0.20__p-0.95__c-4.00__install-1__sweep-01-run-4
+== Evaluation Report ==
+{'# Not Generated': 1, '# Generated': 36, '# Applied': 34, '# Resolved': 5}
+- Wrote per-instance scorecards to //trajectories/carlosejimenez/gpt-4-1106-preview__swe-bench-dev-40-seed24__default_sys-env_window100-detailed_cmd_format-full_history-1_demos__t-0.20__p-0.95__c-4.00__install-1__sweep-01-run-4/scorecards.json
+- Wrote summary of run to //trajectories/carlosejimenez/gpt-4-1106-preview__swe-bench-dev-40-seed24__default_sys-env_window100-detailed_cmd_format-full_history-1_demos__t-0.20__p-0.95__c-4.00__install-1__sweep-01-run-4/results.json
+Reference Report:
+{'# Not Generated': 1, '# Generated': 36, '# Applied': 34, '# Resolved': 5}
+```
+
+## 🧪 SWE-bench Evaluation
+`evaluation.py`: This script contains the logic for SWE-bench evaluation adapted for the SWE-agent setting. Given a set of predictions (e.g. `trajectories/<user>/<experiment>/all_preds.jsonl`), we...
+1. Filter + analyze predictions.
+2. Run SWE-bench style execution based evaluation.
+3. Save outcomes to `results.json` and `scorecards.json` files with info about task-specific and overall performance.
+
+> `run_eval.sh` is provided as an example of how to run `evaluation.py`; a direct invocation is also sketched after the argument list below.
+
+Arguments:
+* `--predictions_path (required)`: The path to the file containing predictions (.jsonl format). This file includes the predictions that need to be evaluated against the benchmark tasks.
+* `--log_dir (required)`: The directory path where log files related to the evaluation process will be stored. It's used for saving logs that are generated during the evaluation.
+* `--swe_bench_tasks (required)`: The path to the file containing the SWE-bench task instances. This file includes the details of the tasks against which the predictions will be evaluated.
+* `--testbed (required)`: The directory path for the testbed, where task repositories and execution environments are set up during evaluation.
+* `--skip_existing (optional)`: If specified, the script will skip over log files that already exist, preventing re-evaluation of those tasks.
+* `--timeout (optional)`: Specifies the timeout in seconds for the evaluation process (default is 900 seconds). This helps in controlling the duration of each evaluation task to avoid excessively long running times.
+* `--verbose (optional)`: Enables verbose mode, which will provide more detailed output during the script execution. This is useful for debugging or getting more insight into the process.
+* `--conda_link (optional)`: Allows specifying a URL to a Conda installation that should be used for the evaluation environment. This can be necessary if the evaluation requires a specific software environment.
+* `--log_suffix (optional)`: An additional parameter to specify a suffix for log files. This can be used for organizing logs more effectively, especially when running multiple evaluations in parallel or under different configurations.
+
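+As referenced above, a direct invocation of `evaluation.py` might look like the following sketch; the predictions path is a placeholder for your own run directory, and the remaining values mirror the defaults used by `run_eval.sh`:
+
+```bash
+python evaluation.py \
+    --predictions_path trajectories/<user>/<experiment>/all_preds.jsonl \
+    --log_dir results \
+    --swe_bench_tasks princeton-nlp/SWE-bench \
+    --testbed testbed \
+    --skip_existing \
+    --timeout 900 \
+    --verbose
+```
+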
+## 📈 Viewing Results
+`aggregate_results.py`: This script aggregates and displays experiment results from the `trajectories/` folder.
+* Experiments are grouped by `(Model, Dataset, Setup, Temp., Top P, Cost, Install)`.
+* The following statistics for each experiment run are shown:
+ * `Not Generated`: # of task instances with no patch generated
+ * `Generated`: # of task instances with patch
+ * `Applied`: # of patches that applied successfully
+ * `Resolved`: # of task instances resolved
+ * `Costs [Success|Failed|Overall]`: Cost of [successful|failed|any] run
+* If there are multiple runs of an experiment (distinguished by `--suffix run`), the above statistics are aggregated as totals or means.
+
+Usage:
+```
+python aggregate_results.py
+```
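+
+For example, to restrict the summary to a single model and to experiments with at least three runs (the model name and threshold are illustrative):
+```
+python aggregate_results.py --model gpt-4-1106-preview --runs_min 3
+```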
+
+Arguments:
+* `--folder (type: str, default: ../trajectories)`: Specifies the folder containing the experiment results. This is where the script will look to gather data.
+* `--model (type: str, nargs: '+')`: Filters the results by model(s). Only results corresponding to the specified model(s) will be included.
+* `--dataset (type: str, nargs: '+')`: Filters the results by dataset(s). Only results for the specified dataset(s) will be analyzed.
+* `--setup (type: str, nargs: '+')`: Filters the results by setup(s). This allows focusing on specific experiment configurations.
+* `--runs_min (type: int)`: The minimum number of runs an experiment should have to be included in the analysis. Helps exclude experiments with insufficient data.
+* `--runs_max (type: int)`: The maximum number of runs to consider for each experiment. This can limit the data to the most relevant runs.
\ No newline at end of file
diff --git a/evaluation/aggregate_results.py b/evaluation/aggregate_results.py
new file mode 100644
index 000000000..35978d7ad
--- /dev/null
+++ b/evaluation/aggregate_results.py
@@ -0,0 +1,219 @@
+import argparse
+import glob
+import json
+import numpy as np
+import os
+import pandas as pd
+import warnings
+warnings.filterwarnings("ignore")
+
+from pathlib import Path
+from rich import print
+
+COLUMNS = [
+ "Model",
+ "Dataset",
+ "Setup",
+ "Temp.",
+ "Top P",
+ "Cost",
+ "Install",
+ "Run",
+ "Not Generated",
+ "Generated",
+ "Applied",
+ "Resolved",
+ "Resolved IDs",
+ "Costs Success",
+ "Costs Failure",
+ "Costs Overall",
+]
+
+
+def get_folders(path):
+ return [entry for entry in Path(path).iterdir() if entry.is_dir()]
+
+
+def parse_folder_name(folder_name):
+ """
+ Parse the folder name to get the different parts
+ """
+ parsed_folder = folder_name.split("__")
+ if len(parsed_folder) == 7:
+ parsed_folder.append("")
+ return parsed_folder
+
+
+def convert_experiments_to_rows(folder_name, runs_max):
+ """
+ Convert each experiment to a row in the csv
+ """
+ rows = []
+ directories = get_folders(folder_name)
+ for directory in directories:
+ folders = get_folders(directory)
+ for folder in folders:
+ # Skip debug folders
+ if "debug" in folder.name:
+ continue
+
+ # Skip fine tuned models
+ if "ft_gpt-3.5" in folder.name:
+ continue
+
+ # Skip folders without a results.json file
+ json_file = folder / "results.json"
+ if not json_file.exists():
+ # print(f"No json file in {folder}")
+ continue
+
+ # Extract run attributes
+ folder_data = parse_folder_name(folder.name)
+ model = folder_data[0]
+ dataset = folder_data[1]
+ if dataset.startswith("swe-bench-dev-easy-"):
+ dataset = dataset[len("swe-bench-dev-easy-") :]
+ elif dataset.startswith("swe-bench-dev-"):
+ dataset = dataset[len("swe-bench-dev-") :]
+ setup = folder_data[2]
+ if len(folder_data) != 8:
+ # TODO: This might be too strict?
+ continue
+ temperature = float(folder_data[3][len("t-"):].strip())
+ top_p = float(folder_data[4][len("p-"):].strip())
+ cost = float(folder_data[5][len("c-"):].strip())
+ install = "Y" if folder_data[6].strip() == "install-1" else "N"
+
+ # Parse out run number
+ run = folder_data[-1]
+ if "run" not in run:
+ continue
+
+ try:
+ if "run-" in run:
+ run = int(run.split("run-")[-1].split("-")[0].replace("_", "").strip())
+ else:
+ run = int(run.split("run")[-1].split("-")[0].replace("_", "").strip())
+ except Exception as e:
+ print(run)
+ raise e
+
+ if runs_max is not None and run > runs_max:
+ continue
+
+ # Load results.json file
+ with json_file.open() as file:
+ results_data = json.load(file)
+ report = results_data.get("report", {})
+
+ # Extract resolved ids (to calculate pass@k)
+ resolved_ids = []
+ if "resolved" in results_data and isinstance(results_data["resolved"], list):
+ resolved_ids = results_data["resolved"]
+ elif "counts" in results_data and isinstance(results_data["counts"]["resolved"], list):
+ resolved_ids = results_data["counts"]["resolved"]
+
+ # Extract instance costs from trajectories
+ costs_overall = []
+ costs_success = []
+ costs_failure = []
+ for x in glob.glob(os.path.join(str(folder), "*.traj")):
+ traj_data = json.load(open(x))
+ if "model_stats" not in traj_data["info"]:
+ continue
+ run_cost = traj_data["info"]["model_stats"]["instance_cost"]
+ inst_id = x.split("/")[-1].split(".")[0]
+ costs_overall.append(run_cost)
+ if inst_id in resolved_ids:
+ costs_success.append(run_cost)
+ else:
+ costs_failure.append(run_cost)
+
+ # Create run row, write to csv
+ rows.append(
+ [
+ model,
+ dataset,
+ setup,
+ temperature,
+ top_p,
+ cost,
+ install,
+ run,
+ report.get("# Not Generated", 0),
+ report.get("# Generated", 0),
+ report.get("# Applied", 0),
+ report.get("# Resolved", 0),
+ resolved_ids,
+ costs_success,
+ costs_failure,
+ costs_overall,
+ ]
+ )
+
+ return rows
+
+
+def get_results_df(folder_name, runs_max):
+ rows = convert_experiments_to_rows(folder_name, runs_max)
+ return (
+ pd.DataFrame(rows, columns=COLUMNS)
+ .sort_values(by=COLUMNS[:8])
+ )
+
+
+def get_results_csv(folder_name, runs_max=None):
+ get_results_df(folder_name, runs_max).to_csv("results.csv")
+ print("Experiment results written to results.csv")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Aggregate results from experiments")
+ parser.add_argument("--folder", type=str, help="Folder containing experiment results", default="../trajectories")
+ parser.add_argument("--model", nargs='+', type=str, help="Model(s) to filter results by.")
+ parser.add_argument("--dataset", nargs='+', type=str, help="Dataset to filter results by.")
+ parser.add_argument("--setup", nargs='+', type=str, help="Setup to filter results by.")
+ parser.add_argument("--runs_min", type=int, help="Minimum number of runs that experiment should have been run for.")
+ parser.add_argument("--runs_max", type=int, help="Maximum number of runs taken into account")
+ args = parser.parse_args()
+
+ df = get_results_df(args.folder, args.runs_max)
+
+ grouped_data = (
+ df.groupby(COLUMNS[:7])
+ .agg(
+ {
+ "Run": "count", # Count the number of runs
+ "Not Generated": "mean",
+ "Generated": "mean",
+ "Applied": "mean",
+ "Resolved": "mean",
+ "Resolved IDs": lambda x: len(set([item for sublist in x for item in sublist])),
+ "Costs Success": lambda x: np.mean([item for sublist in x for item in sublist]),
+ "Costs Failure": lambda x: np.mean([item for sublist in x for item in sublist]),
+ "Costs Overall": lambda x: np.mean([item for sublist in x for item in sublist]),
+ }
+ )
+ .round(2)
+ .reset_index()
+ .rename(columns={"Resolved IDs": "Pass@K", "Run": "Runs"})
+ )
+
+ # Filtering
+ if args.model:
+ grouped_data = grouped_data[grouped_data['Model'].isin(args.model)]
+ if args.dataset:
+ grouped_data = grouped_data[grouped_data['Dataset'].isin(args.dataset)]
+ if args.setup:
+ grouped_data = grouped_data[grouped_data['Setup'].isin(args.setup)]
+ if args.runs_min:
+ grouped_data = grouped_data[grouped_data['Runs'] >= args.runs_min]
+
+ print(f"Total experiments run: {grouped_data.shape[0]}")
+ grouped_data_sorted = grouped_data.sort_values(by=['Dataset', 'Resolved'], ascending=[True, False])
+ pd.set_option("display.max_rows", None)
+ grouped = grouped_data_sorted.groupby('Dataset')
+
+ for name, group in grouped:
+ print(f'\n-----------------\nDataset: {name}\n-----------------')
+ print(group.to_string(index=False))
diff --git a/evaluation/evaluation.py b/evaluation/evaluation.py
new file mode 100644
index 000000000..31800ba91
--- /dev/null
+++ b/evaluation/evaluation.py
@@ -0,0 +1,269 @@
+import argparse
+import json
+import os
+import traceback
+
+from datasets import load_dataset, load_from_disk
+from collections import Counter
+from rich import print
+from swebench import (
+ KEY_INSTANCE_ID,
+ KEY_MODEL,
+ KEY_PREDICTION,
+ get_eval_report,
+ get_logs_eval,
+ get_model_report,
+ get_resolution_status,
+ run_evaluation,
+ get_eval_refs,
+)
+from swebench.harness.constants import (
+ INSTALL_FAIL,
+)
+from unidiff import PatchSet
+
+
+def main(predictions_path, log_dir, swe_bench_tasks, testbed, skip_existing, timeout, verbose, conda_link, log_suffix, num_processes):
+ # Check if paths exist
+ if not os.path.exists(predictions_path):
+ raise FileNotFoundError(f"Predictions path {predictions_path} does not exist")
+ eval_refs = get_eval_refs(swe_bench_tasks)
+ for k, v in eval_refs.items():
+ eval_refs[k] = {key: v[key] for key in [KEY_INSTANCE_ID, "FAIL_TO_PASS", "PASS_TO_PASS"]}
+
+ # Change model_name_or_patch field to directory name for all predictions
+ directory = os.path.dirname(predictions_path)
+ directory_name = directory.rsplit("/", 1)[-1]
+ pred_path_orig = predictions_path
+ pred_path_temp = predictions_path.replace(".jsonl", "_filtered.jsonl")
+
+ pred_total, pred_will_eval = 0, 0
+ with open(pred_path_temp, "w") as f:
+ for l in open(pred_path_orig, "r").readlines():
+ pred_total += 1
+ p = json.loads(l)
+ # Exclude predictions w/ empty strings
+ if p[KEY_PREDICTION] is not None and p[KEY_PREDICTION].strip() != "":
+ p[KEY_MODEL] = directory_name
+ json.dump(p, f)
+ f.write("\n")
+ pred_will_eval += 1
+ print(
+ f"Found {pred_total} total predictions, will evaluate {pred_will_eval} ({pred_total-pred_will_eval} are empty)"
+ )
+
+ # Run evaluation
+ predictions_path = pred_path_temp
+ try:
+ print("🏃 Beginning evaluation...")
+ run_evaluation(
+ predictions_path=predictions_path,
+ log_dir=log_dir,
+ swe_bench_tasks=swe_bench_tasks,
+ testbed=testbed,
+ skip_existing=skip_existing,
+ timeout=timeout,
+ verbose=verbose,
+ conda_link=conda_link,
+ log_suffix=log_suffix,
+ num_processes=num_processes
+ )
+ print("✅ Finished evaluation")
+ except Exception as e:
+ print(f"❌ Evaluation failed: {e}\n{traceback.format_exc()}")
+ pass
+ print("==================================")
+ os.remove(pred_path_temp)
+
+ # Get predictions, define log_dir
+ predictions = [json.loads(l) for l in open(pred_path_orig, "r").readlines()]
+ log_dir = os.path.join(log_dir, directory_name)
+ print(f"Log directory for evaluation run: {log_dir}")
+
+ # Iterate through predictions
+ scorecards = []
+ for p in predictions:
+ scorecard = {KEY_INSTANCE_ID: p[KEY_INSTANCE_ID], "statuses": [], "stats": {}}
+
+ # Add trajectory statistics if traj_path exists
+ traj_path = os.path.join(directory, f"{p[KEY_INSTANCE_ID]}.traj")
+ if os.path.exists(traj_path):
+ traj_data = json.load(open(traj_path, "r"))
+ scorecard["stats"]["traj_num_steps"] = len(traj_data["trajectory"])
+ scorecard["stats"]["traj_action_dist"] = dict(
+ Counter(
+ [
+ entry["action"].strip().split()[0]
+ if entry["role"] == "assistant" and "action" in entry and len(entry["action"]) > 0
+ else None
+ for entry in traj_data["history"]
+ ]
+ )
+ )
+ scorecard["exit_status"] = (
+ traj_data["info"]["exit_status"]
+ if "exit_status" in traj_data["info"]
+ else "n/a"
+ )
+
+ # Check that a prediction was generated
+ if p[KEY_PREDICTION] is None or p[KEY_PREDICTION].strip() == "":
+ scorecard["statuses"].append("not_generated")
+ scorecards.append(scorecard)
+ continue
+ scorecard["statuses"].append("generated")
+
+ # Get log file
+ log_path = os.path.join(
+ log_dir, f"{p[KEY_INSTANCE_ID]}.{directory_name}.eval.log"
+ )
+ if not os.path.exists(log_path):
+ scorecard["statuses"].append("build_failure")
+ scorecards.append(scorecard)
+ continue
+
+ # Get evaluation logs
+ eval_sm, found = get_logs_eval(log_path)
+
+ # Check that the predicted patch was applied (evaluation logs were found)
+ if not found:
+ scorecards.append(scorecard)
+ continue
+ scorecard["statuses"].append("applied")
+
+ with open(log_path, "r") as f:
+ log_contents = f.read()
+ if INSTALL_FAIL in log_contents:
+ scorecard["statuses"].append("install_fail")
+
+ # Get resolution status
+ report = get_eval_report(eval_sm, eval_refs[p[KEY_INSTANCE_ID]])
+ scorecard["test_results"] = {
+ "failure": {
+ "FAIL_TO_PASS": report["FAIL_TO_PASS"]["failure"],
+ "PASS_TO_PASS": report["PASS_TO_PASS"]["failure"],
+ },
+ "success": {
+ "FAIL_TO_PASS": report["FAIL_TO_PASS"]["success"],
+ "PASS_TO_PASS": report["PASS_TO_PASS"]["success"],
+ }
+ }
+ resolution_status = get_resolution_status(report)
+ scorecard["statuses"].append(resolution_status)
+
+ diff_obj = PatchSet(p[KEY_PREDICTION])
+ scorecard["patch_files"] = [
+ x.path
+ for x in diff_obj.modified_files
+ + diff_obj.added_files
+ + diff_obj.removed_files
+ ]
+ scorecard["patch_lines_add"] = sum([f.added for f in diff_obj])
+ scorecard["patch_lines_del"] = sum([f.removed for f in diff_obj])
+ scorecards.append(scorecard)
+
+ # Calculate cumulative results
+ get_ids_with_status = lambda x: [
+ s[KEY_INSTANCE_ID] for s in scorecards if x in s["statuses"]
+ ]
+ report = {
+ "# Not Generated": len(get_ids_with_status("not_generated")),
+ "# Generated": len(get_ids_with_status("generated")),
+ "# Applied": len(get_ids_with_status("applied")),
+ "# Resolved": len(get_ids_with_status("RESOLVED_FULL")),
+ "# Install Fail": len(get_ids_with_status("install_fail")),
+ }
+ print(f"== Evaluation Report ==\n{report}")
+
+ report_exits = dict(
+ Counter([s["exit_status"] if "exit_status" in s else "n/a" for s in scorecards])
+ )
+
+ # Save to summary, scorecard json
+ path_scorecards = os.path.join(directory, "scorecards.json")
+ with open(path_scorecards, "w") as f:
+ json.dump(scorecards, fp=f, indent=2)
+ print(f"- Wrote per-instance scorecards to {path_scorecards}")
+
+ path_results = os.path.join(directory, "results.json")
+ with open(path_results, "w") as f:
+ json.dump(
+ {
+ "report": report,
+ "report_exits": report_exits,
+ "not_generated": get_ids_with_status("not_generated"),
+ "generated": get_ids_with_status("generated"),
+ "applied": get_ids_with_status("applied"),
+ "resolved": get_ids_with_status("RESOLVED_FULL"),
+ "install_fail": get_ids_with_status("install_fail"),
+ },
+ fp=f,
+ indent=2,
+ )
+ print(f"- Wrote summary of run to {path_results}")
+
+ # Sanity check against get_model_report
+ report = get_model_report(
+ directory_name, pred_path_orig, swe_bench_tasks, log_dir
+ )
+ by_outcome = {}
+ by_outcome_func = lambda status: len(
+ [
+ instance_id
+ for _, v in report.items()
+ if isinstance(v, dict)
+ for instance_id in v[status]
+ ]
+ )
+ by_outcome["# Not Generated"] = by_outcome_func("none")
+ by_outcome["# Generated"] = by_outcome_func("generated")
+ by_outcome["# Applied"] = by_outcome_func("applied")
+ by_outcome["# Resolved"] = by_outcome_func("resolved")
+ by_outcome["# Install Fail"] = by_outcome_func("install_fail")
+ print(f"Reference Report:\n{by_outcome}")
+
+
+if __name__ == "__main__":
+ # Parse arguments
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--predictions_path",
+ type=str,
+ help="Path to predictions file (.jsonl)",
+ required=True,
+ )
+ parser.add_argument(
+ "--log_dir", type=str, help="Path to log directory", required=True
+ )
+ parser.add_argument(
+ "--swe_bench_tasks",
+ type=str,
+ help="Path to SWE-bench task instances file",
+ required=True,
+ )
+ parser.add_argument(
+ "--testbed", type=str, help="Path to testbed directory", required=True
+ )
+ parser.add_argument(
+ "--skip_existing", action="store_true", help="(Optional) Skip existing logs"
+ )
+ parser.add_argument(
+ "--timeout",
+ type=int,
+ help="(Optional) Timeout in seconds (default: 900)",
+ default=900,
+ )
+ parser.add_argument(
+ "--verbose", action="store_true", help="(Optional) Verbose mode"
+ )
+ parser.add_argument(
+ "--conda_link", default=None, type=str, help="(Optional) URL to conda installation to use"
+ )
+ parser.add_argument(
+ "--log_suffix", default=None, type=str, help="(Optional) Log suffix"
+ )
+ parser.add_argument(
+ "--num_processes", default=-1, type=int, help="Num processes"
+ )
+ args = parser.parse_args()
+ main(**vars(args))
diff --git a/evaluation/run_eval.sh b/evaluation/run_eval.sh
new file mode 100755
index 000000000..c22c10942
--- /dev/null
+++ b/evaluation/run_eval.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# The first positional argument
+predictions_path=$1
+
+# Check if predictions_path is not provided
+if [ -z "$predictions_path" ]; then
+ echo "Usage: $0 <predictions_path> [dataset_name_or_path] [results_dir] [testbed_dir]"
+ exit 1
+fi
+
+# Default values for the optional arguments
+dataset_name_or_path="${2:-princeton-nlp/SWE-bench}"
+results_dir="${3:-results}"
+testbed_dir="${4:-testbed}"
+
+# If results or testbed directories do not exist, create them
+if [ ! -d "$results_dir" ]; then
+ mkdir -p "$results_dir"
+ echo "Created results directory at $results_dir"
+fi
+
+if [ ! -d "$testbed_dir" ]; then
+ mkdir -p "$testbed_dir"
+ echo "Created testbed directory at $testbed_dir"
+fi
+
+# Run the Python script with the specified arguments
+python evaluation.py \
+ --predictions_path "$predictions_path" \
+ --swe_bench_tasks "$dataset_name_or_path" \
+ --log_dir "$results_dir" \
+ --testbed "$testbed_dir" \
+ --skip_existing \
+ --timeout 900 \
+ --verbose
diff --git a/inspector/README.md b/inspector/README.md
new file mode 100644
index 000000000..7981b2c3c
--- /dev/null
+++ b/inspector/README.md
@@ -0,0 +1,23 @@
+# Inspector
+We provide a web interface for visualizing `.traj` files from the `trajectories` folder more easily.
+
+**Set Up**
+* Run `python server.py trajectories`
+* Open http://localhost:8000 in your browser to use the inspector.
+
+**Additional flags**
+- `--data_path`: Path to SWE-bench style dataset that trajectories were generated for (Optional)
+- `--directory`: Directory of trajectories to inspect (Defaults to `./trajectories` folder)
+- `--port`: Port to host web app (Defaults to `8000`).
+
+**Example Usage**
+
+From running the command:
+```
+python server.py --directory trajectories/carlosejimenez/gpt-4-1106-preview__swe-bench-dev-40-seed24__default_sys-env_window100-detailed_cmd_format-full_history-1_demos__t-0.20__p-0.95__c-4.00__install-1__sweep-01-run-4
+```
+The inspector will then be launched in the browser:
+
+
+
+
diff --git a/inspector/__init__.py b/inspector/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/inspector/favicon.ico b/inspector/favicon.ico
new file mode 100644
index 000000000..362b4e1b2
Binary files /dev/null and b/inspector/favicon.ico differ
diff --git a/inspector/fileViewer.js b/inspector/fileViewer.js
new file mode 100644
index 000000000..7cc9896c8
--- /dev/null
+++ b/inspector/fileViewer.js
@@ -0,0 +1,133 @@
+let currentFileName = null; // Store the current file name
+let trajectoryDirectory = ''; // Global variable to store the directory
+let timeoutIds = []; // Store timeout IDs for pending operations
+
+
+function getBaseUrl() {
+ const protocol = window.location.protocol;
+ const host = window.location.hostname;
+ const port = window.location.port;
+
+ // Use the default port if the port number is empty (for standard HTTP/HTTPS)
+ const defaultPort = (protocol === 'http:' && !port) ? '80' : (protocol === 'https:' && !port) ? '443' : port;
+
+ return `${protocol}//${host}:${defaultPort}`;
+}
+
+
+function fetchFiles() {
+ const baseUrl = getBaseUrl();
+ fetch(`${baseUrl}/files`)
+ .then(response => response.json())
+ .then(files => {
+ const fileList = document.getElementById('fileList');
+ fileList.innerHTML = '';
+ files.forEach(file => {
+ const fileElement = document.createElement('li');
+ fileElement.textContent = file;
+ fileElement.onclick = () => viewFile(file.split(' ')[0]);
+ fileList.appendChild(fileElement);
+ });
+ });
+}
+
+function viewFile(fileName) {
+ currentFileName = fileName; // Remember the open file so refreshCurrentFile() can reload it
+ // Clear any pending message loading from previous files
+ timeoutIds.forEach(timeoutId => clearTimeout(timeoutId));
+ timeoutIds = []; // Reset the list of timeout IDs
+
+ const baseUrl = getBaseUrl();
+ fetch(`${baseUrl}/trajectory/${fileName}`)
+ .then(response => {
+ if (!response.ok) {
+ throw new Error('Network response was not ok');
+ }
+ return response.json();
+ })
+ .then(content => {
+ const container = document.getElementById('fileContent');
+ container.innerHTML = ''; // Clear existing content
+
+ if (content.history && Array.isArray(content.history)) {
+ let delay = 200; // Initial delay
+ const delayIncrement = 50; // Delay between each message, in milliseconds
+
+ content.history.forEach((item, index) => {
+ const timeoutId = setTimeout(() => {
+                        const contentText = item.content ? item.content.replace(/</g, '&lt;').replace(/>/g, '&gt;') : '';
+ let roleClass = item.agent && item.agent !== "primary" ? "subroutine" : item.role ? item.role.toLowerCase().replaceAll(' ', '-') : 'default';
+ const elementId = 'historyItem' + index;
+ const historyItem = document.createElement('div');
+ historyItem.className = `history-item ${roleClass} fade-in`;
+ historyItem.id = elementId;
+ if (contentText.includes("--- DEMONSTRATION ---")) {
+ item.role = "demo";
+ }
+ else if ('is_demo' in item && item.is_demo === true) {
+ item.role += '[demo]';
+ }
+                        historyItem.innerHTML = `
+                            <div class="role">${item.role}</div>
+                            <pre class="content">${contentText}</pre>
+                        `;
+ container.appendChild(historyItem);
+ }, delay);
+
+ delay += delayIncrement; // Increment delay for the next message
+ timeoutIds.push(timeoutId); // Store the timeout ID
+ });
+ } else {
+ container.textContent = 'No history content found.';
+ }
+ })
+ .catch(error => {
+ console.error('Error fetching file:', error);
+ document.getElementById('fileContent').textContent = 'Error loading content. ' + error;
+ });
+
+ // Highlight the selected file in the list
+ document.querySelectorAll('#fileList li').forEach(li => {
+ li.classList.remove('selected');
+ if (li.textContent.split(' ')[0] === fileName) {
+ li.classList.add('selected');
+ }
+ });
+}
+
+function refreshCurrentFile() {
+ if (currentFileName) {
+ const currentScrollPosition = document.documentElement.scrollTop || document.body.scrollTop;
+ viewFile(currentFileName.split(' ')[0]); // Reload the current file
+ // Restore the scroll position after the content is loaded
+ setTimeout(() => {
+ window.scrollTo(0, currentScrollPosition);
+ }, 100);
+ }
+}
+
+function fetchDirectoryInfo() {
+ const baseUrl = getBaseUrl();
+ fetch(`${baseUrl}/directory_info`)
+ .then(response => response.json())
+ .then(data => {
+ if (data.directory) {
+ trajectoryDirectory = data.directory; // Store the directory
+ document.title = `Trajectory Viewer: ${data.directory}`;
+ document.querySelector('h1').textContent = `Trajectory Viewer: ${data.directory}`;
+ }
+ })
+ .catch(error => console.error('Error fetching directory info:', error));
+}
+
+
+window.onload = function() {
+ fetchFiles();
+ fetchDirectoryInfo();
+};
\ No newline at end of file
diff --git a/inspector/icons/computer.png b/inspector/icons/computer.png
new file mode 100644
index 000000000..11a27054b
Binary files /dev/null and b/inspector/icons/computer.png differ
diff --git a/inspector/icons/edit_icon.svg b/inspector/icons/edit_icon.svg
new file mode 100644
index 000000000..11bcba7d7
--- /dev/null
+++ b/inspector/icons/edit_icon.svg
@@ -0,0 +1,11 @@
+
diff --git a/inspector/icons/swe-agent-logo-50.png b/inspector/icons/swe-agent-logo-50.png
new file mode 100644
index 000000000..4ff29d077
Binary files /dev/null and b/inspector/icons/swe-agent-logo-50.png differ
diff --git a/inspector/icons/swellama_blue.png b/inspector/icons/swellama_blue.png
new file mode 100644
index 000000000..4797ff1a6
Binary files /dev/null and b/inspector/icons/swellama_blue.png differ
diff --git a/inspector/icons/swellama_brown.png b/inspector/icons/swellama_brown.png
new file mode 100644
index 000000000..abf131d1b
Binary files /dev/null and b/inspector/icons/swellama_brown.png differ
diff --git a/inspector/icons/swellama_grey.png b/inspector/icons/swellama_grey.png
new file mode 100644
index 000000000..349ed0909
Binary files /dev/null and b/inspector/icons/swellama_grey.png differ
diff --git a/inspector/icons/swellama_tan.png b/inspector/icons/swellama_tan.png
new file mode 100644
index 000000000..0910a08be
Binary files /dev/null and b/inspector/icons/swellama_tan.png differ
diff --git a/inspector/index.html b/inspector/index.html
new file mode 100644
index 000000000..a5613bff9
--- /dev/null
+++ b/inspector/index.html
@@ -0,0 +1,18 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Trajectory Viewer</title>
+    <link rel="icon" href="favicon.ico">
+    <link rel="stylesheet" href="style.css">
+</head>
+<body>
+    <h1>Trajectory File Viewer</h1>
+    <ul id="fileList"></ul>
+    <h2>Conversation History</h2>
+    <div id="fileContent">No file selected.</div>
+    <script src="fileViewer.js"></script>
+</body>
+</html>
diff --git a/inspector/server.py b/inspector/server.py
new file mode 100644
index 000000000..81910683b
--- /dev/null
+++ b/inspector/server.py
@@ -0,0 +1,304 @@
+import http.server
+import json
+import os
+import socketserver
+import traceback
+import yaml
+
+from pathlib import Path
+from argparse import ArgumentParser
+from functools import partial
+
+
+def append_exit(content):
+ last_entry = content["history"][-1]
+ if last_entry["role"] == "system":
+ return content
+
+ exit_status = content.get("info", {}).get("exit_status", None)
+
+ if exit_status is None:
+ return content
+
+ if exit_status.startswith("submitted"):
+ if "submission" in content["info"]:
+ submission = content["info"]["submission"]
+ content["history"].append({
+ "role": "model_patch",
+ "content": submission,
+ })
+ # else submission should be in history already
+ else:
+ raise ValueError("No submission in history or info")
+ # elif content.get("info", {}).get("exit_status", None) is not None:
+ # content["history"].append({
+ # "role": "system",
+ # "content": f"Exited - {content['info']['exit_status']}",
+ # })
+ return content
+
+
+def append_patch(instance_id, content, patches, patch_type):
+ if content.get("info", {}).get("exit_status", None) is not None:
+ if instance_id in patches:
+ content["history"].append({
+ "role": f"{patch_type} Patch",
+ "content": patches[instance_id],
+ })
+ return content
+
+
+def append_results(traj_path, instance_id, content, results, results_file, scorecards, scorecards_file):
+ stats = []
+ model_stats = {}
+ if traj_path.exists():
+ data = json.loads(traj_path.read_text())
+ info = data.get("info", {})
+ model_stats = info.get("model_stats", {})
+ instance_cost = model_stats.get("instance_cost", None)
+ instance_cost = f'{instance_cost:.2f}' if instance_cost is not None else 'N/A'
+ tokens_sent = model_stats.get("tokens_sent", None)
+ tokens_sent = f'{tokens_sent:,}' if tokens_sent is not None else 'N/A'
+ tokens_received = model_stats.get("tokens_received", None)
+ tokens_received = f'{tokens_received:,}' if tokens_received is not None else 'N/A'
+ api_calls = model_stats.get("api_calls", None)
+ api_calls = f'{api_calls:,}' if api_calls is not None else 'N/A'
+ stats.append(f"**** Run Stats ****")
+ stats.append(f"Instance Cost: ${instance_cost}")
+ stats.append(f"Tokens Sent: {tokens_sent}")
+ stats.append(f"Tokens Received: {tokens_received}")
+ stats.append(f"API Calls: {api_calls}\n")
+ status = []
+ if results is None:
+ status.append("Evaluation results not found")
+ elif "not_generated" in results and "generated" in results and "applied" in results and "resolved" in results:
+ is_generated = instance_id in results["generated"]
+ is_applied = instance_id in results["applied"]
+ is_resolved = instance_id in results["resolved"]
+
+ status.append("**** Statuses ****")
+ status.append(
+            f"  {'✅' if is_generated else '❌'} Generated (The agent was {'' if is_generated else 'not '}"
+            "able to generate a pull request to address this issue)")
+        status.append(
+            f"  {'✅' if is_applied else '❌'} Applied (The pull request was {'' if is_applied else 'not '}"
+            "successfully applied to the repo during eval)")
+        status.append(
+            f"  {'✅' if is_resolved else '❌'} Resolved (The pull request "
+            f"{'successfully resolved' if is_resolved else 'did not resolve'} the issue during eval)")
+ else:
+ status.append("Results format not recognized")
+
+ if scorecards is not None:
+ scorecard = [x for x in scorecards if x["instance_id"] == instance_id][0]
+ if "test_results" in scorecard and "failure" in scorecard["test_results"] and (
+ len(scorecard["test_results"]["failure"]["FAIL_TO_PASS"]) > 0 or
+ len(scorecard["test_results"]["failure"]["PASS_TO_PASS"]) > 0
+ ):
+ tests_failing = [
+ f" - {x}" for x in scorecard["test_results"]["failure"]["FAIL_TO_PASS"]
+ ] + [
+ f" - {x}" for x in scorecard["test_results"]["failure"]["PASS_TO_PASS"]
+ ]
+            status.extend(["", "**** Test Results ****", "🧪 Tests Failed"] + tests_failing[:7])
+ if len(tests_failing) > 7:
+ status.append(f" ... and {len(tests_failing) - 7} more")
+ status.append("")
+
+ if status == []:
+ status.append("Instance not found in results")
+ else:
+ status.append("---------------------------")
+        status.append("Note that the evaluation results here may not be accurate or up to date, since they are computed separately from the agent run itself.")
+ results_relative = results_file.resolve().relative_to(Path(__file__).resolve().parent.parent)
+ status.append(f"Check {results_relative} for the most accurate evaluation results.")
+ status.append("")
+ status.append(f"Instance ID: {instance_id}")
+ status.append("Based on results:")
+ status.append(json.dumps(results, indent=4))
+ eval_report = {
+ "role": "Evaluation Report",
+ "content": "\n".join([*stats, *status]),
+ }
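+    # Surface the report at both the top and the bottom of the rendered conversation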
+ content["history"].insert(0, eval_report)
+ content["history"].append(eval_report)
+ return content
+
+
+def load_content(file_name, gold_patches, test_patches):
+ with open(file_name) as infile:
+ content = json.load(infile)
+ results_file = Path(file_name).parent / "results.json"
+ results = None
+ if results_file.exists():
+ with open(results_file) as infile:
+ results = json.load(infile)
+
+ scorecards_file = Path(file_name).parent / "scorecards.json"
+ scorecards = None
+ if scorecards_file.exists():
+ with open(scorecards_file) as infile:
+ scorecards = json.load(infile)
+
+    content = append_exit(content)  # accommodate new and old format
+ content = append_patch(Path(file_name).stem, content, gold_patches, "Gold")
+ content = append_patch(Path(file_name).stem, content, test_patches, "Test")
+ content = append_results(
+ Path(file_name),
+ Path(file_name).stem,
+ content,
+ results,
+ results_file,
+ scorecards,
+ scorecards_file,
+ )
+ return content
+
+
+def load_results(traj_path):
+ results_file = Path(traj_path).parent / "results.json"
+ if results_file.exists():
+ with open(results_file) as infile:
+ return json.load(infile)
+ return None
+
+
+def get_status(traj_path):
+ results = load_results(traj_path)
+ instance_id = Path(traj_path).stem
+ if results is None:
+        return "❓"
+    elif "not_generated" in results and "generated" in results and "applied" in results and "resolved" in results:
+        if instance_id in results["not_generated"]:
+            return "❌"
+        if instance_id in results["generated"]:
+            if instance_id in results["resolved"]:
+                return "✅"
+            else:
+                return "❌"
+    return "❓"
+
+
+class Handler(http.server.SimpleHTTPRequestHandler):
+ file_mod_times = {} # Dictionary to keep track of file modification times
+
+ def __init__(self, *args, **kwargs):
+ self.gold_patches = {}
+ self.test_patches = {}
+ if "gold_patches" in kwargs:
+ self.gold_patches = kwargs.pop("gold_patches")
+ if "test_patches" in kwargs:
+ self.test_patches = kwargs.pop("test_patches")
+ self.traj_dir = kwargs.pop('directory', '.') # Extract directory
+ super().__init__(*args, **kwargs)
+
+ def serve_directory_info(self):
+ self.send_response(200)
+ self.send_header('Content-type', 'application/json')
+ self.end_headers()
+ self.wfile.write(json.dumps({"directory": self.traj_dir}).encode())
+
+ def serve_file_content(self, file_path):
+ try:
+ content = load_content(
+ Path(self.traj_dir) / file_path,
+ self.gold_patches,
+ self.test_patches,
+ )
+ self.send_response(200)
+ self.send_header('Content-type', 'text/plain')
+ self.end_headers()
+ self.wfile.write(json.dumps(content).encode())
+ except FileNotFoundError:
+ self.send_error(404, f"File {file_path} not found")
+
+ def do_GET(self):
+ if self.path == '/directory_info':
+ self.serve_directory_info()
+ elif self.path.startswith('/files'):
+ self.handle_files_request()
+ elif self.path.startswith('/trajectory/'):
+ file_path = self.path[len('/trajectory/'):]
+ self.serve_file_content(file_path)
+ elif self.path.startswith('/check_update'):
+ self.check_for_updates()
+ else:
+ super().do_GET()
+
+ def handle_files_request(self):
+ self.send_response(200)
+ self.send_header('Content-type', 'application/json')
+ self.end_headers()
+ files = sorted(
+ [
+ str(file.relative_to(Path(self.traj_dir))) + " " * 4 + get_status(file)
+ for file in Path(self.traj_dir).glob('**/*.traj')
+ ],
+ key=lambda x: str(Path(self.traj_dir) / x), reverse=True
+ )
+ self.wfile.write(json.dumps(files).encode())
+
+ def check_for_updates(self):
+ current_mod_times = {str(file): os.path.getmtime(file) for file in Path(self.traj_dir).glob('**/*.traj')}
+ if current_mod_times != Handler.file_mod_times:
+ Handler.file_mod_times = current_mod_times
+ self.send_response(200) # Send response that there's an update
+ else:
+ self.send_response(204) # Send no content response if no update
+ self.end_headers()
+
+ def end_headers(self):
+ self.send_header('Access-Control-Allow-Origin', '*')
+ super().end_headers()
+
+
+def main(data_path, directory, port):
+ data = []
+ if data_path is not None:
+ if data_path.endswith(".jsonl"):
+ data = [json.loads(x) for x in open(data_path).readlines()]
+ elif data_path.endswith(".json"):
+ data = json.load(open(data_path))
+ elif "args.yaml" in os.listdir(directory):
+ args = yaml.safe_load(open(os.path.join(directory, "args.yaml")))
+ if "environment" in args and "data_path" in args["environment"]:
+ data_path = os.path.join(
+ Path(__file__).parent, "..",
+ args["environment"]["data_path"]
+ )
+ if os.path.exists(data_path):
+ data = json.load(open(data_path, "r"))
+
+ gold_patches = {
+ d["instance_id"]: d["patch"]
+ if "patch" in d else None for d in data
+ }
+ test_patches = {
+ d["instance_id"]: d["test_patch"]
+ if "test_patch" in d else None for d in data
+ }
+
+ handler_with_directory = partial(
+ Handler,
+ directory=directory,
+ gold_patches=gold_patches,
+ test_patches=test_patches,
+ )
+ try:
+ with socketserver.TCPServer(("", port), handler_with_directory) as httpd:
+ print(f"Serving at http://localhost:{port}")
+ httpd.serve_forever()
+ except OSError as e:
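+        # errno 48 is EADDRINUSE on macOS/BSD; on Linux the same condition typically surfaces as errno 98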
+ if e.errno == 48:
+ print(f"ERROR: Port ({port}) is already in use. Try another port with the --port flag.")
+ else:
+ raise e
+
+
+if __name__ == "__main__":
+ parser = ArgumentParser()
+ parser.add_argument("--data_path", type=str, help="Path to dataset that was used for the trajectories")
+ parser.add_argument("--directory", type=str, help="Directory to serve", default="./trajectories", nargs='?')
+ parser.add_argument("--port", type=int, help="Port to serve", default=8000)
+ args = parser.parse_args()
+ main(**vars(args))
diff --git a/inspector/static.py b/inspector/static.py
new file mode 100644
index 000000000..6288ad18d
--- /dev/null
+++ b/inspector/static.py
@@ -0,0 +1,169 @@
+import json
+import logging
+import yaml
+from pathlib import Path
+from tqdm.auto import tqdm
+from argparse import ArgumentParser
+try:
+ from .server import load_content
+except ImportError:
+ from server import load_content
+
+
+logger = logging.getLogger(__name__)
+logging.getLogger("simple_parsing").setLevel(logging.INFO)
+
+
+TEMPLATE = """
+
+
+ Trajectory Viewer
+
+
+
+