diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..3e85e0983
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,189 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Mac files
+*.DS_Store
+
+# Custom
+keys.cfg
+
+# iPython Notebooks
+*.ipynb
+
+# Evaluation folders
+results/
+testbed/
+temp/
+
+# Ignore all YAML files in data/
+data/*/ic-*
+data/*/single-issues
+
+# Fine tuning data
+fine_tune/*.ipynb
+fine_tune/subtasks/*.jsonl
+temp*.jsonl
+
+# Inspector
+inspector/*.json
+
+# Ignore all files in the private folder
+private/
+
+### Website
+
+# dependencies
+website/frontend/node_modules
+website/frontend/package-lock.json
+website/frontend/.pnp
+*.pnp.js
+
+# testing
+website/frontend/coverage
+
+# production
+website/frontend/build
+
+# misc
+*.env.local
+*.env.development.local
+*.env.test.local
+*.env.production.local
+.api_key
+*npm-debug.log*
+*yarn-debug.log*
+*yarn-error.log*
+
+
+# demo yamls (for editing)
+*.demo.yaml
+
+# trajectory files
+trajectories/*
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 000000000..e702436e2
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 John Yang, Carlos E. Jimenez, Alexander Wettig, Shunyu Yao, Karthik Narasimhan, Ofir Press
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 000000000..1fd5d174d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,98 @@

+<p align="center">
+  <a href="https://swe-agent.com"><img src="assets/swe-agent-banner.png" alt="swe-agent.com" /></a>
+</p>
+
+<p align="center">
+  <a href="https://swe-agent.com">Website &amp; Demo</a>&nbsp;|&nbsp;
+  <a href="https://discord.gg/AVEFbBn2rH">Discord</a>&nbsp;|&nbsp;
+  Paper [coming April 10th]
+</p>

+
+
+## 👋 Overview
+SWE-agent turns LMs (e.g. GPT-4) into software engineering agents that can fix bugs and issues in real GitHub repositories.
+
+On the full [SWE-bench](https://github.com/princeton-nlp/SWE-bench) test set, SWE-agent resolves **12.29%** of issues, achieving state-of-the-art performance.
+
+### ✨ Agent-Computer Interface (ACI)
+We accomplish these results by designing simple LM-centric commands and feedback formats to make it easier for the LM to browse the repository, view, edit, and execute code files. We call this an **Agent-Computer Interface** (ACI) and build the SWE-agent repository to make it easy to iterate on ACI design for repository-level coding agents.
+
+Just as typical language models require good prompt engineering, good ACI design leads to much better results when using agents. As we show in our paper, a baseline agent without a well-tuned ACI does much worse than SWE-agent.
+
+SWE-agent contains features that we discovered to be immensely helpful during the agent-computer interface design process:
+1. We add a linter that runs when an edit command is issued, and do not let the edit command go through if the code isn't syntactically correct.
+2. We supply the agent with a specially built file viewer, instead of having it just `cat` files. We found that this file viewer works best when displaying just 100 lines in each turn. The file viewer we built also has commands for scrolling up and down and for searching within the file.
+3. We supply the agent with a specially built full-directory string searching command. We found that it was important for this tool to succinctly list the matches: we simply list each file that had at least one match. Showing more context about each match proved too confusing for the model.
+4. When commands have an empty output, we return a message saying "Your command ran successfully and did not produce any output."
+
+Read our paper for more details.
+
+```
+@misc{yang2024sweagent,
+      title={SWE-agent: Agent Computer Interfaces Enable Software Engineering Language Models},
+      author={John Yang and Carlos E. Jimenez and Alexander Wettig and Shunyu Yao and Karthik Narasimhan and Ofir Press},
+      year={2024},
+}
+```
+
+## 🚀 Setup
+1. [Install Docker](https://docs.docker.com/engine/install/), then start Docker locally.
+2. [Install Miniconda](https://docs.anaconda.com/free/miniconda/miniconda-install/), then create the `swe-agent` environment with `conda env create -f environment.yml`.
+3. Activate it with `conda activate swe-agent`.
+4. Run `./setup.sh` to create the `swe-agent` docker image.
+5. Create a `keys.cfg` file at the root of this repository and fill in the following:
+```
+OPENAI_API_KEY: 'OpenAI API Key Here if using OpenAI Model (optional)'
+ANTHROPIC_API_KEY: 'Anthropic API Key Here if using Anthropic Model (optional)'
+GITHUB_TOKEN: 'GitHub Token Here (required)'
+```
+See the following links for tutorials on obtaining [Anthropic](https://docs.anthropic.com/claude/reference/getting-started-with-the-api), [OpenAI](https://platform.openai.com/docs/quickstart/step-2-set-up-your-api-key), and [GitHub]() tokens.
+
+## 💽 Usage
+There are two steps to the SWE-agent pipeline. First, SWE-agent takes an input GitHub issue and returns a pull request that attempts to fix it. We call that step *inference*. The second step (currently only available for issues in the SWE-bench benchmark) is to *evaluate* the pull request to verify that it has indeed fixed the issue.
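+
+For orientation, a typical end-to-end session chains the two steps: *inference* writes an `all_preds.jsonl` file of predictions, which *evaluation* then scores. The sketch below is illustrative (the model name, cost limit, and trajectory path are placeholders drawn from the examples that follow); the exact commands are detailed in the next two sections:
+```
+# Step 1 (inference): generate patches for SWE-bench Lite
+python run.py --model_name gpt4 --per_instance_cost_limit 2.00 --config_file ./config/default.yaml
+
+# Step 2 (evaluation): score the predictions produced by step 1
+cd evaluation/
+./run_eval.sh ../trajectories/<username>/<model>-<dataset>-<config>/all_preds.jsonl
+```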
+
+### 👩‍💻 Inference
+**Inference on *any* GitHub Issue**: Using this script, you can run SWE-agent on any GitHub issue!
+```
+python run.py --model_name gpt4 \
+  --data_path https://github.com/pvlib/pvlib-python/issues/1603 --config_file config/default_from_url.yaml
+```
+
+**Inference on SWE-bench**: Run SWE-agent on [SWE-bench Lite](https://www.swebench.com/lite.html) and generate patches.
+```
+python run.py --model_name gpt4 \
+  --per_instance_cost_limit 2.00 \
+  --config_file ./config/default.yaml
+```
+
+If you'd like to run on a *single* issue from SWE-bench, use the `--instance_filter` option as follows:
+```
+python run.py --model_name gpt4 \
+  --instance_filter marshmallow-code__marshmallow-1359
+```
+* See the [`scripts/`](scripts/) folder for other useful scripts and details.
+* See the [`config/`](config/) folder for details about how you can define your own configuration!
+* See the [`swe-agent/agent/`](agent/) folder for details about the logic behind configuration-based workflows.
+* See the [`swe-agent/environment/`](swe-agent/environment/) folder for details about the `SWEEnv` environment (interface + implementation).
+* See the [`trajectories/`](trajectories) folder for details about the output of `run.py`.
+
+### 🧪 Evaluation
+This step is only available for issues from the SWE-bench set. To evaluate generated pull requests:
+```
+cd evaluation/
+./run_eval.sh <predictions_path>
+```
+Replace `<predictions_path>` with the path to the model's predictions, which should be generated from the *Inference* step. The `<predictions_path>` argument should look like `../trajectories/<username>/<model>-<dataset>-<config>/all_preds.jsonl`.
+* See the [`evaluation/`](evaluation/) folder for details about how evaluation works.
+
+
+## 💫 Contributions
+- If you'd like to ask questions, learn about upcoming features, and participate in future development, join our [Discord community](https://discord.gg/AVEFbBn2rH)!
+- If you'd like to contribute to the codebase, we welcome [issues](https://github.com/princeton-nlp/SWE-agent/issues) and [pull requests](https://github.com/princeton-nlp/SWE-agent/pulls)!
+- If you'd like to see a post or tutorial about some topic, please let us know via an [issue](https://github.com/princeton-nlp/SWE-agent/issues).
+
+## 🪪 License
+MIT. Check `LICENSE`.
diff --git a/assets/inspector.png b/assets/inspector.png
new file mode 100644
index 000000000..7a071c543
Binary files /dev/null and b/assets/inspector.png differ
diff --git a/assets/swe-agent-banner.png b/assets/swe-agent-banner.png
new file mode 100644
index 000000000..09474356e
Binary files /dev/null and b/assets/swe-agent-banner.png differ
diff --git a/build_deploy.sh b/build_deploy.sh
new file mode 100755
index 000000000..6b46d126a
--- /dev/null
+++ b/build_deploy.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+python3 -m build
+
+python3 -m twine upload --skip-existing --repository pypi dist/*
+# python3 -m twine upload --skip-existing --repository testpypi dist/*
diff --git a/config/README.md b/config/README.md
new file mode 100644
index 000000000..17719f21e
--- /dev/null
+++ b/config/README.md
@@ -0,0 +1,78 @@
+# Configuration
+
+This folder contains details describing how to write your own configurations to control how agents can interact with the `SWEEnv` environment.
+A configuration is represented as a single `.yaml` file, allowing you to...
+* Define the **commands** that agents may use to traverse + modify a codebase.
+* Write **prompts** that are deterministically/conditionally shown to the agent over the course of a single trajectory.
+* Control the **input/output interface** that sits between the agent and `SWEEnv`.
+
+## Configuration File Fields
+The configuration is a `.yaml` file that consists of several fields. They are fully represented in this following outline:
+
+```yaml
+# Prompt Templates: Control how observations of environment are shown to agent
+system_template: | # .yaml syntax for multi-line string value
+  First `system` message shown to agent
+instance_template: |- # .yaml syntax for multi-line string value w/ no new line
+  Instance prompt, contains task instance-specific content
+next_step_template: |-
+  Format template of per-turn observation (Contains standard output from agent's action)
+next_step_no_output_template: |-
+  Format template of observation when there is no standard output from the agent's action
+format_error_template: |-
+  Format template of error message (Used when agent's action causes an error)
+demonstration_template: |
+  Format template for showing a demonstration to the agent
+demonstrations:
+- `trajectories/<username>/<run_name>/*.traj`
+- File is a demonstration of how to solve a task. This could be an agent-generated trajectory.
+- You can include 1+ demonstrations
+
+# Environment States: Define features of the SWEEnv environment
+env_variables:
+# Default variables for SWEEnv at the beginning of each instance
+  CURRENT_FILE: 0
+  CURRENT_LINE:
+  OVERLAP:
+  SEARCH_FILES:
+  SEARCH_INDEX:
+  SEARCH_RESULTS:
+  WINDOW_SIZE:
+  START_INDEX:
+  END_INDEX:
+  START_CURSOR:
+  END_CURSOR:
+  START_CURSOR_MARK:
+  END_CURSOR_MARK:
+state_command: |
+# `state_command` allows you to update state variables to reflect any aspect of the environment (e.g. current working directory)
+  name: state
+  code: |
+    state() { echo '{"pwd": "'$PWD'"}'; }
+
+# Action Interface: Define how an agent interacts with the SWEEnv environment
+command_files:
+- path/to/bash_file.sh
+- Each file contains a list of commands implemented in bash
+- You can include 1+ command files
+parse_command: Reference to functionality for defining command documentation
+history_processor: Reference to functionality for controlling agent's message history
+parse_function: Parser run on agent output
+```
+
+We recommend looking at...
+* `configs/` for examples of properly formatted configuration files. Each configuration differs in its set of commands, input/output format, demonstrations, etc.
+* `commands/` for the bash implementations of the custom commands that SWE-agent uses to navigate + edit the codebase.
+
+## How a Configuration File is Processed
+Some notes on processing that occurs on config fields when SWE-agent is run:
+* Commands specified in `command_files` will be parsed into a single block of documentation text that can be referenced as `{command_docs}`.
+* `env_variables` are the default variables for the bash environment at the beginning of each instance.
+* `state_command` is used to extract state information from the bash environment (formatted as json) to be used in the templates given to the agent.
+
+Possible variables that can be used in templates are:
+- `{command_docs}` (an automatically compiled collection of available commands + their docstrings)
+- any variable given in `env_variables` (same spelling), e.g., `{WINDOW_SIZE}`
+- any variable extracted as json as part of the `state_command` function
+- the last observation `{observation}`
+- ... this list will grow as we implement more features!
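+
+For reference, here is a minimal sketch of what an entry in a `command_files` file looks like. The `# @yaml` header is what gets parsed into `{command_docs}`; the command shown here (`count_lines`) is a hypothetical example for illustration, not one of the commands shipped in `commands/`:
+
+```bash
+# @yaml
+# signature: count_lines <path>
+# docstring: prints the number of lines in the file at the given path
+# arguments:
+#   path:
+#     type: string
+#     description: the path to the file to count
+#     required: true
+count_lines() {
+    # Hypothetical example command; guard against a missing argument,
+    # mirroring the style of the shipped commands
+    if [ -z "$1" ]; then
+        echo "Usage: count_lines <path>"
+        return
+    fi
+    awk 'END {print NR}' "$1"
+}
+```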
\ No newline at end of file
diff --git a/config/commands/_split_string.py b/config/commands/_split_string.py
new file mode 100644
index 000000000..3ceccc15e
--- /dev/null
+++ b/config/commands/_split_string.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys
+
+def print_flake8_output(input_string, show_line_numbers=False):
+    for value in input_string.split("\n"):
+        parts = value.split()
+        if not show_line_numbers:
+            print(f"- {' '.join(parts[1:])}")
+        else:
+            line_nums = ":".join(parts[0].split(":")[1:])
+            print(f"- {line_nums} {' '.join(parts[1:])}")
+
+if __name__ == "__main__":
+    lint_output = sys.argv[1]
+    print_flake8_output(lint_output)
diff --git a/config/commands/cursors_defaults.sh b/config/commands/cursors_defaults.sh
new file mode 100644
index 000000000..f6168b134
--- /dev/null
+++ b/config/commands/cursors_defaults.sh
@@ -0,0 +1,306 @@
+_reset_cursors() {
+    export START_CURSOR=1
+    export END_CURSOR=1
+}
+
+_constrain_cursors() {
+    # constrain the cursors to lie within the currently visible window
+    local total_lines=$(awk 'END {print NR}' "$CURRENT_FILE")
+    total_lines=$((total_lines < 1 ? 1 : total_lines))  # if the file is empty, set total_lines to 1
+    local start_line=$((CURRENT_LINE - WINDOW / 2))
+    local end_line=$((CURRENT_LINE + WINDOW / 2))
+    start_line=$((start_line < 1 ? 1 : start_line))
+    end_line=$((end_line > total_lines ? total_lines : end_line))
+    local warning_string=""
+    if [ "$START_CURSOR" -lt "$start_line" ]; then
+        warning_string+="START_CURSOR moved to $start_line\n"
+        START_CURSOR=$start_line
+    elif [ "$START_CURSOR" -gt "$end_line" ]; then
+        START_CURSOR=$end_line
+        warning_string+="START_CURSOR moved to $end_line\n"
+    fi
+    if [ "$END_CURSOR" -lt "$start_line" ]; then
+        warning_string+="END_CURSOR moved to $start_line\n"
+        END_CURSOR=$start_line
+    elif [ "$END_CURSOR" -gt "$end_line" ]; then
+        warning_string+="END_CURSOR moved to $end_line\n"
+        END_CURSOR=$end_line
+    fi
+    export START_CURSOR END_CURSOR
+    echo "$warning_string"
+    echo $START_CURSOR $END_CURSOR
+}
+
+_print() {
+    local cursor_warning=$(_constrain_cursors)
+    local cursor_values=$(echo "$cursor_warning" | tail -n 1)
+    cursor_warning=$(echo "$cursor_warning" | head -n -1)
+    export START_CURSOR=$(echo "$cursor_values" | awk '{print $1}')
+    export END_CURSOR=$(echo "$cursor_values" | awk '{print $2}')
+    local total_lines=$(awk 'END {print NR}' $CURRENT_FILE)
+    echo "[File: $(realpath "$CURRENT_FILE") ($total_lines lines total)]"
+    local start_line=$((CURRENT_LINE - WINDOW / 2))
+    local end_line=$((CURRENT_LINE + WINDOW / 2))
+    start_line=$((start_line < 1 ? 1 : start_line))
+    end_line=$((end_line > total_lines ? total_lines : end_line))
+    local lines=()
+    local i=0
+    while IFS= read -r line; do
+        lines[i++]="$line"
+    done < <(awk -v start="$start_line" -v end="$end_line" 'NR>=start && NR<=end {print}' "$CURRENT_FILE")
+    local num_lines=${#lines[@]}
+    if [ $start_line -gt 1 ]; then
+        echo "($((start_line - 1)) more lines above)"
+    fi
+    for ((i=0; i < num_lines; i++)); do
+        local line_number=$((start_line + i))
+        # print the cursor marks around the selected range as the window is rendered
+        if [ "$line_number" -eq "$START_CURSOR" ]; then
+            echo $START_CURSOR_MARK
+        fi
+        echo "$line_number:${lines[i]}"
+        if [ "$line_number" -eq "$END_CURSOR" ]; then
+            echo $END_CURSOR_MARK
+        fi
+    done
+    if [ $end_line -lt $total_lines ]; then
+        echo "($((total_lines - end_line)) more lines below)"
+    fi
+    if [ -n "$cursor_warning" ]; then
+        echo -e "$cursor_warning"
+    fi
+}
+
+_constrain_line() {
+    if [ -z "$CURRENT_FILE" ]
+    then
+        echo "No file open. Use the open command first."
+        return
+    fi
+    local max_line=$(awk 'END {print NR}' $CURRENT_FILE)
+    local half_window=$(jq -n "$WINDOW/2" | jq 'floor')
+    export CURRENT_LINE=$(jq -n "[$CURRENT_LINE, $max_line - $half_window] | min")
+    export CURRENT_LINE=$(jq -n "[$CURRENT_LINE, $half_window] | max")
+}
+
+# @yaml
+# signature: set_cursors <start_line> <end_line>
+# docstring: sets the start and end cursors to the given line numbers
+# arguments:
+#   start_line:
+#     type: integer
+#     description: the line number to set the start cursor to
+#     required: true
+#   end_line:
+#     type: integer
+#     description: the line number to set the end cursor to
+#     required: true
+set_cursors() {
+    if [ -z "$CURRENT_FILE" ]
+    then
+        echo "No file open. Use the open command first."
+        return
+    fi
+    if [ $# -lt 2 ]
+    then
+        echo "Usage: set_cursors <start_line> <end_line>"
+        return
+    fi
+    local start_line=$1
+    local end_line=$2
+    local re='^[0-9]+$'
+    if ! [[ $start_line =~ $re ]]
+    then
+        echo "Usage: set_cursors <start_line> <end_line>"
+        echo "Error: start_line must be a number"
+        return
+    fi
+    if ! [[ $end_line =~ $re ]]
+    then
+        echo "Usage: set_cursors <start_line> <end_line>"
+        echo "Error: end_line must be a number"
+        return
+    fi
+    if [ $start_line -gt $end_line ]
+    then
+        echo "Usage: set_cursors <start_line> <end_line>"
+        echo "Error: start_line must be less than or equal to end_line"
+        return
+    fi
+    export START_CURSOR=$start_line
+    export END_CURSOR=$end_line
+    _print
+}
+
+# @yaml
+# signature: open <path> [<line_number>]
+# docstring: opens the file at the given path in the editor. If line_number is provided, the window will be centered on that line
+# arguments:
+#   path:
+#     type: string
+#     description: the path to the file to open
+#     required: true
+#   line_number:
+#     type: integer
+#     description: the line number to move the window to (if not provided, the window will start at the top of the file)
+#     required: false
+open() {
+    if [ -z "$1" ]
+    then
+        echo "Usage: open <file>"
+        return
+    fi
+    # Check if the second argument is provided
+    if [ -n "$2" ]; then
+        # Check if the provided argument is a valid number
+        if ! [[ $2 =~ ^[0-9]+$ ]]; then
+            echo "Usage: open <file> [<line_number>]"
+            echo "Error: <line_number> must be a number"
+            return  # Exit if the line number is not valid
+        fi
+        local max_line=$(awk 'END {print NR}' $1)
+        if [ $2 -gt $max_line ]; then
+            echo "Warning: <line_number> ($2) is greater than the number of lines in the file ($max_line)"
+            echo "Warning: Setting <line_number> to $max_line"
+            local line_number=$(jq -n "$max_line")  # Set line number to max if greater than max
+        elif [ $2 -lt 1 ]; then
+            echo "Warning: <line_number> ($2) is less than 1"
+            echo "Warning: Setting <line_number> to 1"
+            local line_number=$(jq -n "1")  # Set line number to 1 if less than 1
+        else
+            local line_number=$(jq -n "$2")  # Set line number if valid
+        fi
+    else
+        local line_number=$(jq -n "$WINDOW/2")  # Set default line number if not provided
+    fi
+
+    if [ -f "$1" ]; then
+        export CURRENT_FILE=$(realpath $1)
+        export CURRENT_LINE=$line_number
+        _constrain_line
+        _print
+    else
+        echo "File $1 not found"
+    fi
+}
+
+# @yaml
+# signature: scroll_down
+# docstring: moves the window down {WINDOW} lines
+scroll_down() {
+    if [ -z "$CURRENT_FILE" ]
+    then
+        echo "No file open. Use the open command first."
+        return
+    fi
+    export CURRENT_LINE=$(jq -n "$CURRENT_LINE + $WINDOW - $OVERLAP")
+    _constrain_line
+    _print
+}
+
+# @yaml
+# signature: scroll_up
+# docstring: moves the window up {WINDOW} lines
+scroll_up() {
+    if [ -z "$CURRENT_FILE" ]
+    then
+        echo "No file open. Use the open command first."
+        return
+    fi
+    export CURRENT_LINE=$(jq -n "$CURRENT_LINE - $WINDOW + $OVERLAP")
+    _constrain_line
+    _print
+}
+
+# @yaml
+# signature: goto <line_number>
+# docstring: moves the window to show <line_number>
+# arguments:
+#   line_number:
+#     type: integer
+#     description: the line number to move the window to
+#     required: true
+goto() {
+    if [ $# -gt 1 ]; then
+        echo "goto allows only one line number at a time."
+        return
+    fi
+    if [ -z "$CURRENT_FILE" ]
+    then
+        echo "No file open. Use the open command first."
+        return
+    fi
+    if [ -z "$1" ]
+    then
+        echo "Usage: goto <line_number>"
+        return
+    fi
+    if ! [[ $1 =~ ^[0-9]+$ ]]
+    then
+        echo "Usage: goto <line_number>"
+        echo "Error: <line_number> must be a number"
+        return
+    fi
+    local max_line=$(awk 'END {print NR}' $CURRENT_FILE)
+    if [ $1 -gt $max_line ]
+    then
+        echo "Error: <line_number> must be less than or equal to $max_line"
+        return
+    fi
+    local OFFSET=$(jq -n "$WINDOW/6" | jq 'floor')
+    export CURRENT_LINE=$(jq -n "[$1 + $WINDOW/2 - $OFFSET, 1] | max | floor")
+    _constrain_line
+    _print
+}
+
+# @yaml
+# signature: create <filename>
+# docstring: creates and opens a new file with the given name
+# arguments:
+#   filename:
+#     type: string
+#     description: the name of the file to create
+#     required: true
+create() {
+    if [ -z "$1" ]; then
+        echo "Usage: create <filename>"
+        return
+    fi
+
+    # Check if the file already exists
+    if [ -e "$1" ]; then
+        echo "Error: File '$1' already exists."
+        open "$1"
+        return
+    fi
+
+    # Create the file with an empty new line
+    printf "\n" > "$1"
+    # Use the existing open command to open the created file
+    open "$1"
+}
+
+# @yaml
+# signature: submit
+# docstring: submits your current code and terminates the session
+submit() {
+    cd $ROOT
+
+    # Check if the patch file exists and is non-empty
+    if [ -s "/root/test.patch" ]; then
+        # Apply the patch in reverse
+        git apply -R < "/root/test.patch"
+    fi
+
+    git add -A
+    git diff --cached > model.patch
+    echo "<<SUBMISSION||"
+    cat model.patch
+    echo "||SUBMISSION>>"
+}
\ No newline at end of file
diff --git a/config/commands/cursors_edit_linting.sh b/config/commands/cursors_edit_linting.sh
new file mode 100644
index 000000000..4357043ac
--- /dev/null
+++ b/config/commands/cursors_edit_linting.sh
@@ -0,0 +1,101 @@
+# @yaml
+# signature: |-
+#   edit
+#   <replacement_text>
+#   end_of_edit
+# docstring: replaces *all* of the text between the START CURSOR and the END CURSOR with the replacement_text. The replacement text is terminated by a line with only end_of_edit on it. All of the <replacement_text> will be entered, so make sure your indentation is formatted properly. To enter text at the beginning of the file, set START CURSOR and END CURSOR to 0. Use set_cursors to move the cursors around. Python files will be checked for syntax errors after the edit.
+# end_name: end_of_edit
+# arguments:
+#   replacement_text:
+#     type: string
+#     description: the text to replace the current selection with
+#     required: true
+edit() {
+    if [ -z "$CURRENT_FILE" ]
+    then
+        echo 'No file open. Use the `open` command first.'
+        return
+    fi
+    local start_line=$((START_CURSOR - 1))
+    start_line=$((start_line < 0 ? 0 : start_line))
+    local end_line=$((END_CURSOR))
+    end_line=$((end_line < 0 ? 0 : end_line))
+
+    local replacement=()
+    while IFS= read -r line
+    do
+        replacement+=("$line")
+    done
+
+    local num_lines=${#replacement[@]}
+    # Create a backup of the current file
+    cp "$CURRENT_FILE" "/root/$(basename "$CURRENT_FILE")_backup"
+    # Read the file line by line into an array
+    mapfile -t lines < "$CURRENT_FILE"
+    local new_lines=("${lines[@]:0:$start_line}" "${replacement[@]}" "${lines[@]:$((end_line))}")
+    # Write the new stuff directly back into the original file
+    printf "%s\n" "${new_lines[@]}" >| "$CURRENT_FILE"
+    # Run linter
+    if [[ $CURRENT_FILE == *.py ]]; then
+        lint_output=$(flake8 --select=F821,F822,F831,E111,E112,E113,E999,E902 "$CURRENT_FILE" 2>&1)
+    else
+        # do nothing
+        lint_output=""
+    fi
+    # if there is no output, then the file is good
+    if [ -z "$lint_output" ]; then
+        _constrain_line
+        # set to START + num_lines - 1, unless num_lines is 0, then set to START
+        export END_CURSOR=$((num_lines == 0 ? START_CURSOR : START_CURSOR + num_lines - 1))
+        export START_CURSOR=$START_CURSOR
+        _print
+        echo "File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary."
+    else
+        echo "Your proposed edit has introduced new syntax error(s). Please understand the fixes and retry your edit command."
+        echo ""
+        echo "ERRORS:"
+        _split_string "$lint_output"
+        echo ""
+
+        # Save original values
+        original_current_line=$CURRENT_LINE
+        original_window=$WINDOW
+        original_end_cursor=$END_CURSOR
+
+        # Update values
+        export CURRENT_LINE=$(( (num_lines / 2) + start_line ))  # Set to "center" of edit
+        export WINDOW=$((num_lines + 10))  # Show +/- 5 lines around edit
+        export END_CURSOR=$((num_lines == 0 ? START_CURSOR : START_CURSOR + num_lines - 1))
+
+        echo "This is how your edit would have looked if applied"
+        echo "-------------------------------------------------"
+        _constrain_line
+        _print
+        echo "-------------------------------------------------"
+        echo ""
+
+        # Restoring CURRENT_FILE to original contents.
+        cp "/root/$(basename "$CURRENT_FILE")_backup" "$CURRENT_FILE"
+
+        export CURRENT_LINE=$(( ((end_line - start_line) / 2) + start_line ))  # Set to "center" of edit
+        export WINDOW=$((end_line - start_line + 10))
+        export END_CURSOR=$original_end_cursor
+
+        echo "This is the original code before your edit"
+        echo "-------------------------------------------------"
+        _constrain_line
+        _print
+        echo "-------------------------------------------------"
+
+        # Restore original values
+        export CURRENT_LINE=$original_current_line
+        export WINDOW=$original_window
+        export END_CURSOR=$original_end_cursor
+
+        echo "Your changes have NOT been applied. Please fix your edit command and try again."
+        echo "You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code."
+        echo "DO NOT re-run the same failed edit command. Running it again will lead to the same error."
+    fi
+    # Remove backup file
+    rm -f "/root/$(basename "$CURRENT_FILE")_backup"
+}
diff --git a/config/commands/defaults.sh b/config/commands/defaults.sh
new file mode 100644
index 000000000..7294a732b
--- /dev/null
+++ b/config/commands/defaults.sh
@@ -0,0 +1,195 @@
+_print() {
+    local total_lines=$(awk 'END {print NR}' $CURRENT_FILE)
+    echo "[File: $(realpath $CURRENT_FILE) ($total_lines lines total)]"
+    lines_above=$(jq -n "$CURRENT_LINE - $WINDOW/2" | jq '[0, .] | max | floor')
+    lines_below=$(jq -n "$total_lines - $CURRENT_LINE - $WINDOW/2" | jq '[0, .] | max | round')
+    if [ $lines_above -gt 0 ]; then
+        echo "($lines_above more lines above)"
+    fi
+    cat $CURRENT_FILE | grep -n $ | head -n $(jq -n "[$CURRENT_LINE + $WINDOW/2, $WINDOW/2] | max | floor") | tail -n $(jq -n "$WINDOW")
+    if [ $lines_below -gt 0 ]; then
+        echo "($lines_below more lines below)"
+    fi
+}
+
+_constrain_line() {
+    if [ -z "$CURRENT_FILE" ]
+    then
+        echo "No file open. Use the open command first."
+        return
+    fi
+    local max_line=$(awk 'END {print NR}' $CURRENT_FILE)
+    local half_window=$(jq -n "$WINDOW/2" | jq 'floor')
+    export CURRENT_LINE=$(jq -n "[$CURRENT_LINE, $max_line - $half_window] | min")
+    export CURRENT_LINE=$(jq -n "[$CURRENT_LINE, $half_window] | max")
+}
+
+# @yaml
+# signature: open <path> [<line_number>]
+# docstring: opens the file at the given path in the editor. If line_number is provided, the window will be moved to include that line
+# arguments:
+#   path:
+#     type: string
+#     description: the path to the file to open
+#     required: true
+#   line_number:
+#     type: integer
+#     description: the line number to move the window to (if not provided, the window will start at the top of the file)
+#     required: false
+open() {
+    if [ -z "$1" ]
+    then
+        echo "Usage: open <file>"
+        return
+    fi
+    # Check if the second argument is provided
+    if [ -n "$2" ]; then
+        # Check if the provided argument is a valid number
+        if ! [[ $2 =~ ^[0-9]+$ ]]; then
+            echo "Usage: open <file> [<line_number>]"
+            echo "Error: <line_number> must be a number"
+            return  # Exit if the line number is not valid
+        fi
+        local max_line=$(awk 'END {print NR}' $1)
+        if [ $2 -gt $max_line ]; then
+            echo "Warning: <line_number> ($2) is greater than the number of lines in the file ($max_line)"
+            echo "Warning: Setting <line_number> to $max_line"
+            local line_number=$(jq -n "$max_line")  # Set line number to max if greater than max
+        elif [ $2 -lt 1 ]; then
+            echo "Warning: <line_number> ($2) is less than 1"
+            echo "Warning: Setting <line_number> to 1"
+            local line_number=$(jq -n "1")  # Set line number to 1 if less than 1
+        else
+            local OFFSET=$(jq -n "$WINDOW/6" | jq 'floor')
+            local line_number=$(jq -n "[$2 + $WINDOW/2 - $OFFSET, 1] | max | floor")
+        fi
+    else
+        local line_number=$(jq -n "$WINDOW/2")  # Set default line number if not provided
+    fi
+
+    if [ -f "$1" ]; then
+        export CURRENT_FILE=$(realpath $1)
+        export CURRENT_LINE=$line_number
+        _constrain_line
+        _print
+    elif [ -d "$1" ]; then
+        echo "Error: $1 is a directory. You can only open files. Use cd or ls to navigate directories."
+    else
+        echo "File $1 not found"
+    fi
+}
+
+# @yaml
+# signature: goto <line_number>
+# docstring: moves the window to show <line_number>
+# arguments:
+#   line_number:
+#     type: integer
+#     description: the line number to move the window to
+#     required: true
+goto() {
+    if [ $# -gt 1 ]; then
+        echo "goto allows only one line number at a time."
+        return
+    fi
+    if [ -z "$CURRENT_FILE" ]
+    then
+        echo "No file open. Use the open command first."
+        return
+    fi
+    if [ -z "$1" ]
+    then
+        echo "Usage: goto <line_number>"
+        return
+    fi
+    if ! [[ $1 =~ ^[0-9]+$ ]]
+    then
+        echo "Usage: goto <line_number>"
+        echo "Error: <line_number> must be a number"
+        return
+    fi
+    local max_line=$(awk 'END {print NR}' $CURRENT_FILE)
+    if [ $1 -gt $max_line ]
+    then
+        echo "Error: <line_number> must be less than or equal to $max_line"
+        return
+    fi
+    local OFFSET=$(jq -n "$WINDOW/6" | jq 'floor')
+    export CURRENT_LINE=$(jq -n "[$1 + $WINDOW/2 - $OFFSET, 1] | max | floor")
+    _constrain_line
+    _print
+}
+
+# @yaml
+# signature: scroll_down
+# docstring: moves the window down {WINDOW} lines
+scroll_down() {
+    if [ -z "$CURRENT_FILE" ]
+    then
+        echo "No file open. Use the open command first."
+        return
+    fi
+    export CURRENT_LINE=$(jq -n "$CURRENT_LINE + $WINDOW - $OVERLAP")
+    _constrain_line
+    _print
+}
+
+# @yaml
+# signature: scroll_up
+# docstring: moves the window up {WINDOW} lines
+scroll_up() {
+    if [ -z "$CURRENT_FILE" ]
+    then
+        echo "No file open. Use the open command first."
+        return
+    fi
+    export CURRENT_LINE=$(jq -n "$CURRENT_LINE - $WINDOW + $OVERLAP")
+    _constrain_line
+    _print
+}
+
+# @yaml
+# signature: create <filename>
+# docstring: creates and opens a new file with the given name
+# arguments:
+#   filename:
+#     type: string
+#     description: the name of the file to create
+#     required: true
+create() {
+    if [ -z "$1" ]; then
+        echo "Usage: create <filename>"
+        return
+    fi
+
+    # Check if the file already exists
+    if [ -e "$1" ]; then
+        echo "Error: File '$1' already exists."
+        open "$1"
+        return
+    fi
+
+    # Create the file with an empty new line
+    printf "\n" > "$1"
+    # Use the existing open command to open the created file
+    open "$1"
+}
+
+# @yaml
+# signature: submit
+# docstring: submits your current code and terminates the session
+submit() {
+    cd $ROOT
+
+    # Check if the patch file exists and is non-empty
+    if [ -s "/root/test.patch" ]; then
+        # Apply the patch in reverse
+        git apply -R < "/root/test.patch"
+    fi
+
+    git add -A
+    git diff --cached > model.patch
+    echo "<<SUBMISSION||"
+    cat model.patch
+    echo "||SUBMISSION>>"
+}
\ No newline at end of file
diff --git a/config/commands/edit_linting.sh b/config/commands/edit_linting.sh
new file mode 100644
index 000000000..7c4294bba
--- /dev/null
+++ b/config/commands/edit_linting.sh
@@ -0,0 +1,130 @@
+# @yaml
+# signature: |-
+#   edit <start_line>:<end_line>
+#   <replacement_text>
+#   end_of_edit
+# docstring: replaces lines <start_line> through <end_line> (inclusive) with the given text in the open file. The replacement text is terminated by a line with only end_of_edit on it. All of the <replacement_text> will be entered, so make sure your indentation is formatted properly. Python files will be checked for syntax errors after the edit. If the system detects a syntax error, the edit will not be executed. Simply try to edit the file again, but make sure to read the error message and modify the edit command you issue accordingly. Issuing the same command a second time will just lead to the same error message again.
+# end_name: end_of_edit
+# arguments:
+#   start_line:
+#     type: integer
+#     description: the line number to start the edit at
+#     required: true
+#   end_line:
+#     type: integer
+#     description: the line number to end the edit at (inclusive)
+#     required: true
+#   replacement_text:
+#     type: string
+#     description: the text to replace the current selection with
+#     required: true
+edit() {
+    if [ -z "$CURRENT_FILE" ]
+    then
+        echo 'No file open. Use the `open` command first.'
+        return
+    fi
+
+    local start_line="$(echo $1: | cut -d: -f1)"
+    local end_line="$(echo $1: | cut -d: -f2)"
+
+    if [ -z "$start_line" ] || [ -z "$end_line" ]
+    then
+        echo "Usage: edit <start_line>:<end_line>"
+        return
+    fi
+
+    local re='^[0-9]+$'
+    if ! [[ $start_line =~ $re ]]; then
+        echo "Usage: edit <start_line>:<end_line>"
+        echo "Error: start_line must be a number"
+        return
+    fi
+    if ! [[ $end_line =~ $re ]]; then
+        echo "Usage: edit <start_line>:<end_line>"
+        echo "Error: end_line must be a number"
+        return
+    fi
+
+    # Bash array starts at 0, so let's adjust
+    local start_line=$((start_line - 1))
+    local end_line=$((end_line))
+
+    local line_count=0
+    local replacement=()
+    while IFS= read -r line
+    do
+        replacement+=("$line")
+        ((line_count++))
+    done
+
+    # Create a backup of the current file
+    cp "$CURRENT_FILE" "/root/$(basename "$CURRENT_FILE")_backup"
+
+    # Read the file line by line into an array
+    mapfile -t lines < "$CURRENT_FILE"
+    local new_lines=("${lines[@]:0:$start_line}" "${replacement[@]}" "${lines[@]:$((end_line))}")
+    # Write the new stuff directly back into the original file
+    printf "%s\n" "${new_lines[@]}" >| "$CURRENT_FILE"
+
+    # Run linter
+    if [[ $CURRENT_FILE == *.py ]]; then
+        lint_output=$(flake8 --select=F821,F822,F831,E111,E112,E113,E999,E902 "$CURRENT_FILE" 2>&1)
+    else
+        # do nothing
+        lint_output=""
+    fi
+
+    # if there is no output, then the file is good
+    if [ -z "$lint_output" ]; then
+        export CURRENT_LINE=$start_line
+        _constrain_line
+        _print
+
+        echo "File updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary."
+    else
+        echo "Your proposed edit has introduced new syntax error(s). Please understand the fixes and retry your edit command."
+        echo ""
+        echo "ERRORS:"
+        _split_string "$lint_output"
+        echo ""
+
+        # Save original values
+        original_current_line=$CURRENT_LINE
+        original_window=$WINDOW
+
+        # Update values
+        export CURRENT_LINE=$(( (line_count / 2) + start_line ))  # Set to "center" of edit
+        export WINDOW=$((line_count + 10))  # Show +/- 5 lines around edit
+
+        echo "This is how your edit would have looked if applied"
+        echo "-------------------------------------------------"
+        _constrain_line
+        _print
+        echo "-------------------------------------------------"
+        echo ""
+
+        # Restoring CURRENT_FILE to original contents.
+        cp "/root/$(basename "$CURRENT_FILE")_backup" "$CURRENT_FILE"
+
+        export CURRENT_LINE=$(( ((end_line - start_line + 1) / 2) + start_line ))
+        export WINDOW=$((end_line - start_line + 10))
+
+        echo "This is the original code before your edit"
+        echo "-------------------------------------------------"
+        _constrain_line
+        _print
+        echo "-------------------------------------------------"
+
+        # Restore original values
+        export CURRENT_LINE=$original_current_line
+        export WINDOW=$original_window
+
+        echo "Your changes have NOT been applied. Please fix your edit command and try again."
+        echo "You either need to 1) Specify the correct start/end line arguments or 2) Correct your edit code."
+        echo "DO NOT re-run the same failed edit command. Running it again will lead to the same error."
+    fi
+
+    # Remove backup file
+    rm -f "/root/$(basename "$CURRENT_FILE")_backup"
+}
diff --git a/config/commands/search.sh b/config/commands/search.sh
new file mode 100644
index 000000000..8db25c2b1
--- /dev/null
+++ b/config/commands/search.sh
@@ -0,0 +1,155 @@
+# @yaml
+# signature: search_dir <search_term> [<dir>]
+# docstring: searches for search_term in all files in dir. If dir is not provided, searches in the current directory
+# arguments:
+#   search_term:
+#     type: string
+#     description: the term to search for
+#     required: true
+#   dir:
+#     type: string
+#     description: the directory to search in (if not provided, searches in the current directory)
+#     required: false
+search_dir() {
+    if [ $# -eq 1 ]; then
+        local search_term="$1"
+        local dir="./"
+    elif [ $# -eq 2 ]; then
+        local search_term="$1"
+        if [ -d "$2" ]; then
+            local dir="$2"
+        else
+            echo "Directory $2 not found"
+            return
+        fi
+    else
+        echo "Usage: search_dir <search_term> [<dir>]"
+        return
+    fi
+    dir=$(realpath "$dir")
+    local matches=$(find "$dir" -type f ! -path '*/.*' -exec grep -nIH "$search_term" {} + | cut -d: -f1 | sort | uniq -c)
+    # if no matches, return
+    if [ -z "$matches" ]; then
+        echo "No matches found for \"$search_term\" in $dir"
+        return
+    fi
+    # Calculate total number of matches
+    local num_matches=$(echo "$matches" | awk '{sum+=$1} END {print sum}')
+    # calculate total number of files matched
+    local num_files=$(echo "$matches" | wc -l | awk '{$1=$1; print $0}')
+    # if num_files is > 100, print an error
+    if [ $num_files -gt 100 ]; then
+        echo "More than $num_files files matched for \"$search_term\" in $dir. Please narrow your search."
+        return
+    fi
+
+    echo "Found $num_matches matches for \"$search_term\" in $dir:"
+    echo "$matches" | awk '{$2=$2; gsub(/^\.+\/+/, "./", $2); print $2 " ("$1" matches)"}'
+    echo "End of matches for \"$search_term\" in $dir"
+}
+
+# @yaml
+# signature: search_file <search_term> [<file>]
+# docstring: searches for search_term in file. If file is not provided, searches in the current open file
+# arguments:
+#   search_term:
+#     type: string
+#     description: the term to search for
+#     required: true
+#   file:
+#     type: string
+#     description: the file to search in (if not provided, searches in the current open file)
+#     required: false
+search_file() {
+    # Check if the first argument is provided
+    if [ -z "$1" ]; then
+        echo "Usage: search_file <search_term> [<file>]"
+        return
+    fi
+    # Check if the second argument is provided
+    if [ -n "$2" ]; then
+        # Check if the provided argument is a valid file
+        if [ -f "$2" ]; then
+            local file="$2"  # Set file if valid
+        else
+            echo "Usage: search_file <search_term> [<file>]"
+            echo "Error: File name $2 not found. Please provide a valid file name."
+            return  # Exit if the file is not valid
+        fi
+    else
+        # Check if a file is open
+        if [ -z "$CURRENT_FILE" ]; then
+            echo "No file open. Use the open command first."
+            return  # Exit if no file is open
+        fi
+        local file="$CURRENT_FILE"  # Set file to the current open file
+    fi
+    local search_term="$1"
+    file=$(realpath "$file")
+    # Use grep to directly get the desired formatted output
+    local matches=$(grep -nH "$search_term" "$file")
+    # Check if no matches were found
+    if [ -z "$matches" ]; then
+        echo "No matches found for \"$search_term\" in $file"
+        return
+    fi
+    # Calculate total number of matches
+    local num_matches=$(echo "$matches" | wc -l | awk '{$1=$1; print $0}')
+
+    # calculate total number of lines matched
+    local num_lines=$(echo "$matches" | cut -d: -f1 | sort | uniq | wc -l | awk '{$1=$1; print $0}')
+    # if num_lines is > 100, print an error
+    if [ $num_lines -gt 100 ]; then
+        echo "More than $num_lines lines matched for \"$search_term\" in $file. Please narrow your search."
+        return
+    fi
+
+    # Print the total number of matches and the matches themselves
+    echo "Found $num_matches matches for \"$search_term\" in $file:"
+    echo "$matches" | cut -d: -f1-2 | sort -u -t: -k2,2n | while IFS=: read -r filename line_number; do
+        echo "Line $line_number:$(sed -n "${line_number}p" "$file")"
+    done
+    echo "End of matches for \"$search_term\" in $file"
+}
+
+# @yaml
+# signature: find_file <file_name> [<dir>]
+# docstring: finds all files with the given name in dir. If dir is not provided, searches in the current directory
+# arguments:
+#   file_name:
+#     type: string
+#     description: the name of the file to search for
+#     required: true
+#   dir:
+#     type: string
+#     description: the directory to search in (if not provided, searches in the current directory)
+#     required: false
+find_file() {
+    if [ $# -eq 1 ]; then
+        local file_name="$1"
+        local dir="./"
+    elif [ $# -eq 2 ]; then
+        local file_name="$1"
+        if [ -d "$2" ]; then
+            local dir="$2"
+        else
+            echo "Directory $2 not found"
+            return
+        fi
+    else
+        echo "Usage: find_file <file_name> [<dir>]"
+        return
+    fi
+
+    dir=$(realpath "$dir")
+    local matches=$(find "$dir" -type f -name "$file_name")
+    # if no matches, return
+    if [ -z "$matches" ]; then
+        echo "No matches found for \"$file_name\" in $dir"
+        return
+    fi
+    # Calculate total number of matches
+    local num_matches=$(echo "$matches" | wc -l | awk '{$1=$1; print $0}')
+    echo "Found $num_matches matches for \"$file_name\" in $dir:"
+    echo "$matches" | awk '{print $0}'
+}
\ No newline at end of file
diff --git a/config/configs/default_sys-env_cursors_window100-detailed_cmd_format-full_history-1_demos.yaml b/config/configs/default_sys-env_cursors_window100-detailed_cmd_format-full_history-1_demos.yaml
new file mode 100644
index 000000000..80b55eecd
--- /dev/null
+++ b/config/configs/default_sys-env_cursors_window100-detailed_cmd_format-full_history-1_demos.yaml
@@ -0,0 +1,116 @@
+system_template: |-
+  SETTING: You are an autonomous programmer, and you're working directly in the command line with a special interface.
+
+  The special interface consists of a file editor that shows you {WINDOW} lines of a file at a time.
+  In addition to typical bash commands, you can also use the following commands to help you navigate and edit files.
+
+  COMMANDS:
+  {command_docs}
+
+  Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION.
+  If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important, and code that is not indented correctly will fail and require fixing before it can be run.
+
+  RESPONSE FORMAT:
+  Your shell prompt is formatted as follows:
+  (Open file: <path>) <cwd> $
+
+  You need to format your output using two fields: discussion and command.
+  Your output should always include _one_ discussion and _one_ command field EXACTLY as in the following example:
+  DISCUSSION
+  First I'll start by using ls to see what files are in the current directory. Then maybe we can look at some relevant files to see what they look like.
+  ```
+  ls -a
+  ```
+
+  You should only include a *SINGLE* command in the command section and then wait for a response from the shell before continuing with more discussion and commands. Everything you include in the DISCUSSION section will be saved for future reference.
+  If you'd like to issue two commands at once, PLEASE DO NOT DO THAT! Please instead first submit just the first command, and then after receiving a response you'll be able to issue the second command.
+  You're free to use any other bash commands you want (e.g. find, grep, cat, ls, cd) in addition to the special commands listed above.
+  However, the environment does NOT support interactive session commands (e.g. python, vim), so please do not invoke them.
+instance_template: |-
+  We're currently solving the following issue within our repository. Here's the issue text:
+  ISSUE:
+  {issue}
+
+  INSTRUCTIONS:
+  Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
+  Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for feedback after every command.
+  When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
+  Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
+
+  NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
+
+  IMPORTANT TIPS:
+  1. Always start by trying to replicate the bug that the issue discusses.
+     If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
+     Then start trying to fix it.
+     When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
+
+     If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
+     so that you can be sure that the script indeed ran fine all the way through.
+
+  2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
+
+  3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
+
+  4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code, to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the linux 'find' command.
+
+  5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
+
+  6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
+
+
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+next_step_template: |-
+  {observation}
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+next_step_no_output_template: |-
+  Your command ran successfully and did not produce any output.
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+demonstration_template: |
+  Here is a demonstration of how to correctly accomplish this task.
+  It is included to show you how to correctly use the interface.
+  You do not need to follow exactly what is done in the demonstration.
+  --- DEMONSTRATION ---
+  {demonstration}
+  --- END OF DEMONSTRATION ---
+state_command:
+  name: state
+  code: |
+    state() {
+      local working_dir="$PWD";
+      if [ -z $CURRENT_FILE ]; then
+        echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}';
+      else
+        echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}';
+      fi
+    };
+parse_function: ThoughtActionParser
+env_variables:
+  WINDOW: 200
+  OVERLAP: 2
+  CURRENT_LINE: 0
+  CURRENT_FILE: ''
+  SEARCH_RESULTS: ()
+  SEARCH_FILES: ()
+  SEARCH_INDEX: 0
+  START_INDEX: 0
+  END_INDEX: 0
+  START_CURSOR: 0
+  END_CURSOR: 0
+  START_CURSOR_MARK: '"<<<<< START CURSOR >>>>>"' # these have to use double quotes
+  END_CURSOR_MARK: '"<<<<< END CURSOR >>>>>"' # these have to use double quotes
+command_files:
+- config/commands/cursors_defaults.sh
+- config/commands/cursors_edit_linting.sh
+- config/commands/search.sh
+- config/commands/_split_string.py
+parse_command: ParseCommandDetailed
+history_processor: DefaultHistoryProcessor
+demonstrations:
+- trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_cursors_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj
diff --git a/config/configs/default_sys-env_window100-detailed_cmd_format-full_history-1_demos.yaml b/config/configs/default_sys-env_window100-detailed_cmd_format-full_history-1_demos.yaml
new file mode 100644
index 000000000..14b691814
--- /dev/null
+++ b/config/configs/default_sys-env_window100-detailed_cmd_format-full_history-1_demos.yaml
@@ -0,0 +1,110 @@
+system_template: |-
+  SETTING: You are an autonomous programmer, and you're working directly in the command line with a special interface.
+
+  The special interface consists of a file editor that shows you {WINDOW} lines of a file at a time.
+  In addition to typical bash commands, you can also use the following commands to help you navigate and edit files.
+
+  COMMANDS:
+  {command_docs}
+
+  Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION.
+  If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important, and code that is not indented correctly will fail and require fixing before it can be run.
+
+  RESPONSE FORMAT:
+  Your shell prompt is formatted as follows:
+  (Open file: <path>) <cwd> $
+
+  You need to format your output using two fields: discussion and command.
+  Your output should always include _one_ discussion and _one_ command field EXACTLY as in the following example:
+  DISCUSSION
+  First I'll start by using ls to see what files are in the current directory. Then maybe we can look at some relevant files to see what they look like.
+  ```
+  ls -a
+  ```
+
+  You should only include a *SINGLE* command in the command section and then wait for a response from the shell before continuing with more discussion and commands. Everything you include in the DISCUSSION section will be saved for future reference.
+  If you'd like to issue two commands at once, PLEASE DO NOT DO THAT! Please instead first submit just the first command, and then after receiving a response you'll be able to issue the second command.
+  You're free to use any other bash commands you want (e.g. find, grep, cat, ls, cd) in addition to the special commands listed above.
+  However, the environment does NOT support interactive session commands (e.g. python, vim), so please do not invoke them.
+instance_template: |-
+  We're currently solving the following issue within our repository. Here's the issue text:
+  ISSUE:
+  {issue}
+
+  INSTRUCTIONS:
+  Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
+  Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for feedback after every command.
+  When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
+  Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
+
+  NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
+
+  IMPORTANT TIPS:
+  1. Always start by trying to replicate the bug that the issue discusses.
+     If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
+     Then start trying to fix it.
+     When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
+
+     If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
+     so that you can be sure that the script indeed ran fine all the way through.
+
+  2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
+
+  3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
+
+  4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code, to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the linux 'find' command.
+
+  5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
+
+  6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
+
+
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+next_step_template: |-
+  {observation}
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+next_step_no_output_template: |-
+  Your command ran successfully and did not produce any output.
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+demonstration_template: |
+  Here is a demonstration of how to correctly accomplish this task.
+  It is included to show you how to correctly use the interface.
+ You do not need to follow exactly what is done in the demonstration. + --- DEMONSTRATION --- + {demonstration} + --- END OF DEMONSTRATION --- +state_command: + name: state + code: | + state() { + local working_dir="$PWD"; + if [ -z $CURRENT_FILE ]; then + echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}'; + else + echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}'; + fi + }; +parse_function: ThoughtActionParser +env_variables: + WINDOW: 100 + OVERLAP: 2 + CURRENT_LINE: 0 + CURRENT_FILE: '' + SEARCH_RESULTS: () + SEARCH_FILES: () + SEARCH_INDEX: 0 +command_files: +- config/commands/defaults.sh +- config/commands/search.sh +- config/commands/edit_linting.sh +- config/commands/_split_string.py +parse_command: ParseCommandDetailed +history_processor: DefaultHistoryProcessor +demonstrations: +- trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj diff --git a/config/configs/xml_sys-env_cursors_window100-detailed_cmd_format-full_history-1_demos.yaml b/config/configs/xml_sys-env_cursors_window100-detailed_cmd_format-full_history-1_demos.yaml new file mode 100644 index 000000000..e68761878 --- /dev/null +++ b/config/configs/xml_sys-env_cursors_window100-detailed_cmd_format-full_history-1_demos.yaml @@ -0,0 +1,110 @@ +system_template: |- + SETTING: You are an autonomous programmer, and you're working directly in the command line with a special interface. + + The special interface consists of a file editor that shows you {WINDOW} lines of a file at a time. + In addition to typical bash commands, you can also use the following commands to help you navigate and edit files. + + COMMANDS: + {command_docs} + + Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION. + If you'd like to add the line ' print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not idented correctly will fail and require fixing before it can be run. + + RESPONSE FORMAT: + Your shell prompt is formatted as follows: + (Open file: ) $ + + You need to format your output using two fields; discussion and command. + Your output should always include _one_ discussion and _one_ command field EXACTLY as in the following example: + DISCUSSION + First I'll start by using ls to see what files are in the current directory. Then maybe we can look at some relevant files to see what they look like. + + ls -a + + + You should only include a *SINGLE* command in the command section and then wait for a response from the shell before continuing with more discussion and commands. Everything you include in the DISCUSSION section will be saved for future reference. + If you'd like to issue two commands at once, PLEASE DO NOT DO THAT! Please instead first submit just the first command, and then after receiving a response you'll be able to issue the second command. + You're free to use any other bash commands you want (e.g. find, grep, cat, ls, cd) in addition to the special commands listed above. + However, the environment does NOT support interactive session commands (e.g. python, vim), so please do not invoke them. +instance_template: |- + We're currently solving the following issue within our repository. Here's the issue text: + ISSUE: + {issue} + + INSTRUCTIONS: + Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. 
You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
+  Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for feedback after every command.
+  When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
+  Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
+
+  NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
+
+  IMPORTANT TIPS:
+  1. Always start by trying to replicate the bug that the issue discusses.
+     If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
+     Then start trying to fix it.
+     When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
+
+     If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
+     so that you can be sure that the script indeed ran fine all the way through.
+
+  2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
+
+  3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
+
+  4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code, to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the linux 'find' command.
+
+  5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
+
+  6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
+
+
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+next_step_template: |-
+  {observation}
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+next_step_no_output_template: |-
+  Your command ran successfully and did not produce any output.
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+put_demos_in_history: true
+state_command:
+  name: state
+  code: |
+    state() {
+      local working_dir="$PWD";
+      if [ -z $CURRENT_FILE ]; then
+        echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}';
+      else
+        echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}';
+      fi
+    };
+parse_function: XMLThoughtActionParser
+env_variables:
+  WINDOW: 200
+  OVERLAP: 2
+  CURRENT_LINE: 0
+  CURRENT_FILE: ''
+  SEARCH_RESULTS: ()
+  SEARCH_FILES: ()
+  SEARCH_INDEX: 0
+  START_INDEX: 0
+  END_INDEX: 0
+  START_CURSOR: 0
+  END_CURSOR: 0
+  START_CURSOR_MARK: '"<<<<< START CURSOR >>>>>"' # these have to use double quotes
+  END_CURSOR_MARK: '"<<<<< END CURSOR >>>>>"' # these have to use double quotes
+command_files:
+- config/commands/cursors_defaults.sh
+- config/commands/cursors_edit_linting.sh
+- config/commands/search.sh
+- config/commands/_split_string.py
+parse_command: ParseCommandDetailed
+history_processor: DefaultHistoryProcessor
+demonstrations:
+- trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__xml_sys-env_cursors_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj
diff --git a/config/configs/xml_sys-env_window100-detailed_cmd_format-full_history-1_demos.yaml b/config/configs/xml_sys-env_window100-detailed_cmd_format-full_history-1_demos.yaml
new file mode 100644
index 000000000..00cb15c9f
--- /dev/null
+++ b/config/configs/xml_sys-env_window100-detailed_cmd_format-full_history-1_demos.yaml
@@ -0,0 +1,104 @@
+system_template: |-
+  SETTING: You are an autonomous programmer, and you're working directly in the command line with a special interface.
+
+  The special interface consists of a file editor that shows you {WINDOW} lines of a file at a time.
+  In addition to typical bash commands, you can also use the following commands to help you navigate and edit files.
+
+  COMMANDS:
+  {command_docs}
+
+  Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION.
+  If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+  RESPONSE FORMAT:
+  Your shell prompt is formatted as follows:
+  (Open file: <path>) <cwd> $
+
+  You need to format your output using two fields: discussion and command.
+  Your output should always include _one_ discussion and _one_ command field EXACTLY as in the following example:
+  DISCUSSION
+  First I'll start by using ls to see what files are in the current directory. Then maybe we can look at some relevant files to see what they look like.
+  <command>
+  ls -a
+  </command>
+
+  You should only include a *SINGLE* command in the command section and then wait for a response from the shell before continuing with more discussion and commands. Everything you include in the DISCUSSION section will be saved for future reference.
+  If you'd like to issue two commands at once, PLEASE DO NOT DO THAT! Please instead first submit just the first command, and then after receiving a response you'll be able to issue the second command.
+  You're free to use any other bash commands you want (e.g. find, grep, cat, ls, cd) in addition to the special commands listed above.
+  However, the environment does NOT support interactive session commands (e.g. python, vim), so please do not invoke them.
+instance_template: |-
+  We're currently solving the following issue within our repository.
Here's the issue text:
+  ISSUE:
+  {issue}
+
+  INSTRUCTIONS:
+  Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
+  Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for feedback after every command.
+  When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
+  Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
+
+  NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
+
+  IMPORTANT TIPS:
+  1. Always start by trying to replicate the bug that the issue discusses.
+     If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
+     Then start trying to fix it.
+     When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
+
+     If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
+     so that you can be sure that the script indeed ran fine all the way through.
+
+  2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
+
+  3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
+
+  4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code, to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the linux 'find' command.
+
+  5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
+
+  6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
+
+
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+next_step_template: |-
+  {observation}
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+next_step_no_output_template: |-
+  Your command ran successfully and did not produce any output.
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+put_demos_in_history: true
+state_command:
+  name: state
+  code: |
+    state() {
+      local working_dir="$PWD";
+      if [ -z $CURRENT_FILE ]; then
+        echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}';
+      else
+        echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}';
+      fi
+    };
+parse_function: XMLThoughtActionParser
+env_variables:
+  WINDOW: 100
+  OVERLAP: 2
+  CURRENT_LINE: 0
+  CURRENT_FILE: ''
+  SEARCH_RESULTS: ()
+  SEARCH_FILES: ()
+  SEARCH_INDEX: 0
+command_files:
+- config/commands/defaults.sh
+- config/commands/search.sh
+- config/commands/edit_linting.sh
+- config/commands/_split_string.py
+parse_command: ParseCommandDetailed
+history_processor: DefaultHistoryProcessor
+demonstrations:
+- trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__xml_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj
diff --git a/config/default.yaml b/config/default.yaml
new file mode 100644
index 000000000..14b691814
--- /dev/null
+++ b/config/default.yaml
@@ -0,0 +1,110 @@
+system_template: |-
+  SETTING: You are an autonomous programmer, and you're working directly in the command line with a special interface.
+
+  The special interface consists of a file editor that shows you {WINDOW} lines of a file at a time.
+  In addition to typical bash commands, you can also use the following commands to help you navigate and edit files.
+
+  COMMANDS:
+  {command_docs}
+
+  Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION.
+  If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+  RESPONSE FORMAT:
+  Your shell prompt is formatted as follows:
+  (Open file: <path>) <cwd> $
+
+  You need to format your output using two fields: discussion and command.
+  Your output should always include _one_ discussion and _one_ command field EXACTLY as in the following example:
+  DISCUSSION
+  First I'll start by using ls to see what files are in the current directory. Then maybe we can look at some relevant files to see what they look like.
+  ```
+  ls -a
+  ```
+
+  You should only include a *SINGLE* command in the command section and then wait for a response from the shell before continuing with more discussion and commands. Everything you include in the DISCUSSION section will be saved for future reference.
+  If you'd like to issue two commands at once, PLEASE DO NOT DO THAT! Please instead first submit just the first command, and then after receiving a response you'll be able to issue the second command.
+  You're free to use any other bash commands you want (e.g. find, grep, cat, ls, cd) in addition to the special commands listed above.
+  However, the environment does NOT support interactive session commands (e.g. python, vim), so please do not invoke them.
+instance_template: |-
+  We're currently solving the following issue within our repository. Here's the issue text:
+  ISSUE:
+  {issue}
+
+  INSTRUCTIONS:
+  Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
+  Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for feedback after every command.
+  When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
+  Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
+
+  NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
+
+  IMPORTANT TIPS:
+  1. Always start by trying to replicate the bug that the issue discusses.
+     If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
+     Then start trying to fix it.
+     When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
+
+     If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
+     so that you can be sure that the script indeed ran fine all the way through.
+
+  2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
+
+  3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
+
+  4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code, to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the linux 'find' command.
+
+  5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
+
+  6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
+
+
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+next_step_template: |-
+  {observation}
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+next_step_no_output_template: |-
+  Your command ran successfully and did not produce any output.
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+demonstration_template: |
+  Here is a demonstration of how to correctly accomplish this task.
+  It is included to show you how to correctly use the interface.
+  You do not need to follow exactly what is done in the demonstration.
+  --- DEMONSTRATION ---
+  {demonstration}
+  --- END OF DEMONSTRATION ---
+state_command:
+  name: state
+  code: |
+    state() {
+      local working_dir="$PWD";
+      if [ -z $CURRENT_FILE ]; then
+        echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}';
+      else
+        echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}';
+      fi
+    };
+parse_function: ThoughtActionParser
+env_variables:
+  WINDOW: 100
+  OVERLAP: 2
+  CURRENT_LINE: 0
+  CURRENT_FILE: ''
+  SEARCH_RESULTS: ()
+  SEARCH_FILES: ()
+  SEARCH_INDEX: 0
+command_files:
+- config/commands/defaults.sh
+- config/commands/search.sh
+- config/commands/edit_linting.sh
+- config/commands/_split_string.py
+parse_command: ParseCommandDetailed
+history_processor: DefaultHistoryProcessor
+demonstrations:
+- trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj
diff --git a/config/default_from_url.yaml b/config/default_from_url.yaml
new file mode 100644
index 000000000..30b570b8a
--- /dev/null
+++ b/config/default_from_url.yaml
@@ -0,0 +1,112 @@
+system_template: |-
+  SETTING: You are an autonomous programmer, and you're working directly in the command line with a special interface.
+
+  The special interface consists of a file editor that shows you {WINDOW} lines of a file at a time.
+  In addition to typical bash commands, you can also use the following commands to help you navigate and edit files.
+
+  COMMANDS:
+  {command_docs}
+
+  Please note that THE EDIT COMMAND REQUIRES PROPER INDENTATION.
+  If you'd like to add the line '        print(x)' you must fully write that out, with all those spaces before the code! Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+
+  RESPONSE FORMAT:
+  Your shell prompt is formatted as follows:
+  (Open file: <path>) <cwd> $
+
+  You need to format your output using two fields: discussion and command.
+  Your output should always include _one_ discussion and _one_ command field EXACTLY as in the following example:
+  DISCUSSION
+  First I'll start by using ls to see what files are in the current directory. Then maybe we can look at some relevant files to see what they look like.
+  ```
+  ls -a
+  ```
+
+  You should only include a *SINGLE* command in the command section and then wait for a response from the shell before continuing with more discussion and commands. Everything you include in the DISCUSSION section will be saved for future reference.
+  If you'd like to issue two commands at once, PLEASE DO NOT DO THAT! Please instead first submit just the first command, and then after receiving a response you'll be able to issue the second command.
+  You're free to use any other bash commands you want (e.g. find, grep, cat, ls, cd) in addition to the special commands listed above.
+  However, the environment does NOT support interactive session commands (e.g. python, vim), so please do not invoke them.
+instance_template: |-
+  We're currently solving the following issue within our repository. Here's the issue text:
+  ISSUE:
+  {issue}
+
+  INSTRUCTIONS:
+  Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
+  Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for feedback after every command.
+  When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
+  Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with `python <script_name>.py`.
+
+  NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
+
+  IMPORTANT TIPS:
+  1. Always start by trying to replicate the bug that the issue discusses.
+     If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
+     Then start trying to fix it.
+     When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
+
+     If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
+     so that you can be sure that the script indeed ran fine all the way through.
+
+  2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
+
+  3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
+
+  4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code, to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the linux 'find' command.
+
+  5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
+
+  6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
+
+  7. It may be necessary to install the repository from source before you can run code. Please think about how to install the environment from the repository directory if you need to do so.
+
+
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+next_step_template: |-
+  {observation}
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+next_step_no_output_template: |-
+  Your command ran successfully and did not produce any output.
+  (Open file: {open_file})
+  (Current directory: {working_dir})
+  bash-$
+demonstration_template: |
+  Here is a demonstration of how to correctly accomplish this task.
+  It is included to show you how to correctly use the interface.
+  You do not need to follow exactly what is done in the demonstration.
+  --- DEMONSTRATION ---
+  {demonstration}
+  --- END OF DEMONSTRATION ---
+state_command:
+  name: state
+  code: |
+    state() {
+      local working_dir="$PWD";
+      if [ -z $CURRENT_FILE ]; then
+        echo '{"open_file": "n/a", "working_dir": "'$working_dir'"}';
+      else
+        echo '{"open_file": "'$(realpath $CURRENT_FILE)'", "working_dir": "'$working_dir'"}';
+      fi
+    };
+parse_function: ThoughtActionParser
+env_variables:
+  WINDOW: 100
+  OVERLAP: 2
+  CURRENT_LINE: 0
+  CURRENT_FILE: ''
+  SEARCH_RESULTS: ()
+  SEARCH_FILES: ()
+  SEARCH_INDEX: 0
+command_files:
+- config/commands/defaults.sh
+- config/commands/search.sh
+- config/commands/edit_linting.sh
+- config/commands/_split_string.py
+parse_command: ParseCommandDetailed
+history_processor: DefaultHistoryProcessor
+demonstrations:
+- trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default__t-0.20__p-0.95__c-2.00__install-1___install_from_source/marshmallow-code__marshmallow-1867.traj
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 000000000..c4773a76c
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,70 @@
+# Docker
+To ensure reproducibility and sandboxed execution of SWE-agent actions across systems, we adopt practices established in [prior work](https://intercode-benchmark.github.io/) and use [๐Ÿ‹ Docker](https://www.docker.com/) containers to carry out SWE-agent inference.
+
+* The `swe.Dockerfile` file defines the customized image for the SWE-agent environment.
+* The `./setup.sh` script automatically builds this image.
+* When `run.py` is invoked, containers are automatically created from the built image.
+    * There is no need to manually build a container from the image.
+
+Here, we explain what each line in `swe.Dockerfile` does:
+
+1. **Base Image**: Start from the Ubuntu 22.04 (Jammy) image.
+```bash
+FROM ubuntu:jammy
+```
+2. **Build Argument**: Define a build argument `MINICONDA_URL` that will be used to specify the Miniconda installer URL during the build process.
+```bash
+ARG MINICONDA_URL
+```
+3. **Install Third-Party Tools**: Update the package lists for the Ubuntu package manager and install several essential development tools. Clean up after the installation.
+```bash
+RUN apt-get update && \
+    apt-get install -y bash gcc git jq wget g++ make && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+```
+4. **Initialize Git**: Configure global Git settings with a user email and name.
+```bash
+RUN git config --global user.email "sweagent@pnlp.org"
+RUN git config --global user.name "sweagent"
+```
+5. **Environment Variables**: Set the `ROOT` environment variable and customize the shell prompt.
+```bash
+ENV ROOT='/dev/'
+RUN prompt() { echo " > "; };
+ENV PS1="> "
+```
+6. **Create Assets for Inference**: Create two files that are used to track metadata during an episode.
+```bash
+RUN touch /root/files_to_edit.txt
+RUN touch /root/test.patch
+```
+7. **Enhance `ls` Command**: Modify the `.bashrc` file to alias the `ls` command.
+```bash
+RUN echo "alias ls='ls -F'" >> /root/.bashrc
+```
+8. **Install Miniconda**: Download and install Miniconda, then initialize conda with Bash support and add `conda-forge` to the channels list.
+```bash
+ENV PATH="/root/miniconda3/bin:${PATH}"
+ARG PATH="/root/miniconda3/bin:${PATH}"
+RUN wget ${MINICONDA_URL} -O miniconda.sh \
+    && mkdir /root/.conda \
+    && bash miniconda.sh -b \
+    && rm -f miniconda.sh
+RUN conda --version \
+    && conda init bash \
+    && conda config --append channels conda-forge
+```
+9. 
**Install Python Packages**: Copy the `requirements.txt` file into the image and install the specified Python packages. +```bash +COPY docker/requirements.txt /root/requirements.txt +RUN pip install -r /root/requirements.txt +``` +10. **Set Working Directory**: Set the working directory to the root directory. +```bash +WORKDIR / +``` +11. **Default Command**: Set the default command to open a Bash shell when the container starts. +```bash +CMD ["/bin/bash"] +``` \ No newline at end of file diff --git a/docker/eval.Dockerfile b/docker/eval.Dockerfile new file mode 100644 index 000000000..50a0c55b0 --- /dev/null +++ b/docker/eval.Dockerfile @@ -0,0 +1,6 @@ +FROM swe-agent + +COPY ../evaluation/evaluation.py /evaluation.py +RUN pip install git+https://github.com/princeton-nlp/SWE-bench.git +RUN pip install unidiff +CMD ["python", "/evaluation.py"] diff --git a/docker/requirements.txt b/docker/requirements.txt new file mode 100644 index 000000000..04cf09e82 --- /dev/null +++ b/docker/requirements.txt @@ -0,0 +1,3 @@ +anthropic +config +openai \ No newline at end of file diff --git a/docker/swe.Dockerfile b/docker/swe.Dockerfile new file mode 100644 index 000000000..ecf5be09c --- /dev/null +++ b/docker/swe.Dockerfile @@ -0,0 +1,44 @@ +FROM ubuntu:jammy + +ARG MINICONDA_URL + +# Install third party tools +RUN apt-get update && \ + apt-get install -y bash gcc git jq wget g++ make && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Initialize git +RUN git config --global user.email "sweagent@pnlp.org" +RUN git config --global user.name "sweagent" + +# Environment variables +ENV ROOT='/dev/' +RUN prompt() { echo " > "; }; +ENV PS1="> " + +# Create file for tracking edits, test patch +RUN touch /root/files_to_edit.txt +RUN touch /root/test.patch + +# add ls file indicator +RUN echo "alias ls='ls -F'" >> /root/.bashrc + +# Install miniconda +ENV PATH="/root/miniconda3/bin:${PATH}" +ARG PATH="/root/miniconda3/bin:${PATH}" +RUN wget ${MINICONDA_URL} -O miniconda.sh \ + && mkdir /root/.conda \ + && bash miniconda.sh -b \ + && rm -f miniconda.sh +RUN conda --version \ + && conda init bash \ + && conda config --append channels conda-forge + +# Install python packages +COPY docker/requirements.txt /root/requirements.txt +RUN pip install -r /root/requirements.txt + +WORKDIR / + +CMD ["/bin/bash"] diff --git a/environment.yml b/environment.yml new file mode 100644 index 000000000..87d980201 --- /dev/null +++ b/environment.yml @@ -0,0 +1,21 @@ +name: swe-agent +dependencies: + - python=3.9 + - pip + - pip: + - anthropic + - config + - datasets + - docker + - gnureadline + - gymnasium + - numpy + - openai>=1.0 + - pandas + - rich + - ruamel.yaml + - swebench>=1.0.1 + - tenacity + - unidiff + - simple-parsing + - together diff --git a/evaluation/README.md b/evaluation/README.md new file mode 100644 index 000000000..4b584fca8 --- /dev/null +++ b/evaluation/README.md @@ -0,0 +1,79 @@ +# Evaluation + +The `evaluation/` folder provides SWE-agent compatible scripts for running [SWE-bench style evaluation](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md) on model patch predictions. In addition, we also include additional scripts to quantify model performance on "subtasks" within the SWE-bench task, such as identifying the right file(s) to edit. 
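+
+For reference, each line of a predictions file is one standalone JSON record per task instance. The sketch below shows a hypothetical entry; the key names follow the `swebench` constants (`KEY_INSTANCE_ID`, `KEY_MODEL`, `KEY_PREDICTION`) that `evaluation.py` imports, while the run name and patch body are illustrative and truncated:
+
+```bash
+# Append one illustrative prediction record to all_preds.jsonl
+cat <<'EOF' >> all_preds.jsonl
+{"instance_id": "marshmallow-code__marshmallow-1867", "model_name_or_path": "gpt-4-1106-preview__example-run", "model_patch": "diff --git a/src/marshmallow/fields.py b/src/marshmallow/fields.py\n..."}
+EOF
+```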
+
+## ๐Ÿ“– Table of Contents
+- [Evaluation](#evaluation)
+  - [๐Ÿ“– Table of Contents](#-table-of-contents)
+  - [๐Ÿ‡ Quick Start ](#-quick-start-)
+  - [๐Ÿช‘ SWE-bench Evaluation ](#-swe-bench-evaluation-)
+  - [๐Ÿ“ˆ Viewing Results ](#-viewing-results-)
+
+## ๐Ÿ‡ Quick Start
+You can run evaluations on SWE-bench by passing in the predictions generated by SWE-agent (usually named `all_preds.jsonl`). Simply run the following script with the path to your predictions:
+
+```bash
+./run_eval.sh <predictions_path>
+```
+
+Depending on the number of task instances and how long setting up the execution environment takes, the evaluation can take anywhere from a couple of minutes to 7 hours for the entirety of the SWE-bench test split.
+
+When evaluation finishes, you should see an output similar to the following:
+```bash
+2024-03-31 16:47:00,263 - taskenv_context_manager - INFO - [pvlib__pvlib-python__0.8] [pvlib__pvlib-python-1395] Installing with command: . /n/fs/p-swe-bench/testbed/ba397fe0d6/pvlib__pvlib-python/0.8/tmpom22t9na/miniconda3/bin/activate pvlib__pvlib-python__0.8 && echo 'activate successful' && pip install -e .[all]
+2024-03-31 16:47:10,602 - taskenv_context_manager - INFO - [pvlib__pvlib-python__0.8] [pvlib__pvlib-python-1395] Installation successful
+2024-03-31 16:47:10,619 - taskenv_context_manager - INFO - [pvlib__pvlib-python__0.8] [pvlib__pvlib-python-1395] Apply patch successful (test)
+2024-03-31 16:47:10,635 - taskenv_context_manager - INFO - [pvlib__pvlib-python__0.8] [pvlib__pvlib-python-1395] Apply patch successful (pred)
+2024-03-31 16:47:13,453 - taskenv_context_manager - INFO - [pvlib__pvlib-python__0.8] [pvlib__pvlib-python-1395] Test script run successful
+==================================
+Log directory for evaluation run: /n/fs/p-swe-bench/results/gpt-4-1106-preview__swe-bench-dev-40-seed24__default_sys-env_window100-detailed_cmd_format-full_history-1_demos__t-0.20__p-0.95__c-4.00__install-1__sweep-01-run-4
+== Evaluation Report ==
+{'# Not Generated': 1, '# Generated': 36, '# Applied': 34, '# Resolved': 5}
+- Wrote per-instance scorecards to //trajectories/carlosejimenez/gpt-4-1106-preview__swe-bench-dev-40-seed24__default_sys-env_window100-detailed_cmd_format-full_history-1_demos__t-0.20__p-0.95__c-4.00__install-1__sweep-01-run-4/scorecards.json
+- Wrote summary of run to //trajectories/carlosejimenez/gpt-4-1106-preview__swe-bench-dev-40-seed24__default_sys-env_window100-detailed_cmd_format-full_history-1_demos__t-0.20__p-0.95__c-4.00__install-1__sweep-01-run-4/results.json
+Reference Report:
+{'# Not Generated': 1, '# Generated': 36, '# Applied': 34, '# Resolved': 5}
+```
+
+## ๐Ÿช‘ SWE-bench Evaluation
+`evaluation.py`: This script contains the logic for SWE-bench evaluation adapted for the SWE-agent setting. Given a set of predictions (e.g. `trajectories/<username>/<run_name>/all_preds.jsonl`), we...
+1. Filter + analyze predictions.
+2. Run SWE-bench style execution-based evaluation.
+3. Save outcomes to `results.json` and `scorecards.json` files with info about task-specific and overall performance.
+
+> `run_eval.sh` is provided as an example of how to run `evaluation.py`.
+
+Arguments:
+* `--predictions_path (required)`: The path to the file containing predictions (.jsonl format). This file includes the predictions that need to be evaluated against the benchmark tasks.
+* `--log_dir (required)`: The directory path where log files related to the evaluation process will be stored. It's used for saving logs that are generated during the evaluation.
+* `--swe_bench_tasks (required)`: The path to the file containing the SWE-bench task instances. This file includes the details of the tasks against which the predictions will be evaluated.
+* `--testbed (required)`: The directory path for the testbed, which is used for setting up the environment or context for the evaluations.
+* `--skip_existing (optional)`: If specified, the script will skip over log files that already exist, preventing re-evaluation of those tasks.
+* `--timeout (optional)`: Specifies the timeout in seconds for the evaluation process (default is 900 seconds). This helps in controlling the duration of each evaluation task to avoid excessively long running times.
+* `--verbose (optional)`: Enables verbose mode, which will provide more detailed output during the script execution. This is useful for debugging or getting more insight into the process.
+* `--conda_link (optional)`: Allows specifying a URL to a Conda installation that should be used for the evaluation environment. This can be necessary if the evaluation requires a specific software environment.
+* `--log_suffix (optional)`: An additional parameter to specify a suffix for log files. This can be used for organizing logs more effectively, especially when running multiple evaluations in parallel or under different configurations.
+
+## ๐Ÿ“ˆ Viewing Results
+`aggregate_results.py`: This script aggregates and displays experiment results from the `trajectories/` folder.
+* Experiments are grouped by `(Model, Dataset, Config File, Temp., Top P, Cost, Install)`.
+* The following statistics for each experiment run are shown:
+  * `Not Generated`: # of task instances with no patch generated
+  * `Generated`: # of task instances with patch
+  * `Applied`: # of patches that applied successfully
+  * `Resolved`: # of task instances resolved
+  * `Costs [Success|Failed|Overall]`: Cost of [successful|failed|any] run
+* If there are multiple runs of an experiment (distinguished by `--suffix run`), the above statistics are aggregated as totals or means.
+
+Usage:
+```
+python aggregate_results.py
+```
+
+Arguments:
+* `--folder (type: str, default: ../trajectories)`: Specifies the folder containing the experiment results. This is where the script will look to gather data.
+* `--model (type: str, nargs: '+')`: Filters the results by model(s). Only results corresponding to the specified model(s) will be included.
+* `--dataset (type: str, nargs: '+')`: Filters the results by dataset(s). Only results for the specified dataset(s) will be analyzed.
+* `--setup (type: str, nargs: '+')`: Filters the results by setup(s). This allows focusing on specific experiment configurations.
+* `--runs_min (type: int)`: The minimum number of runs an experiment should have to be included in the analysis. Helps exclude experiments with insufficient data.
+* `--runs_max (type: int)`: The maximum number of runs to consider for each experiment. This can limit the data to the most relevant runs.
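+
+As a concrete (hypothetical) example, the following invocation keeps only experiments with at least two runs and filters to one model and dataset. Note that the script strips the `swe-bench-dev-` prefix when parsing dataset names out of folder names, so the filter value omits it; the names themselves are illustrative:
+```
+python aggregate_results.py --model gpt-4-1106-preview --dataset 40-seed24 --runs_min 2
+```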
\ No newline at end of file diff --git a/evaluation/aggregate_results.py b/evaluation/aggregate_results.py new file mode 100644 index 000000000..35978d7ad --- /dev/null +++ b/evaluation/aggregate_results.py @@ -0,0 +1,219 @@ +import argparse +import glob +import json +import numpy as np +import os +import pandas as pd +import warnings +warnings.filterwarnings("ignore") + +from pathlib import Path +from rich import print + +COLUMNS = [ + "Model", + "Dataset", + "Setup", + "Temp.", + "Top P", + "Cost", + "Install", + "Run", + "Not Generated", + "Generated", + "Applied", + "Resolved", + "Resolved IDs", + "Costs Success", + "Costs Failure", + "Costs Overall", +] + + +def get_folders(path): + return [entry for entry in Path(path).iterdir() if entry.is_dir()] + + +def parse_folder_name(folder_name): + """ + Parse the folder name to get the different parts + """ + parsed_folder = folder_name.split("__") + if len(parsed_folder) == 7: + parsed_folder.append("") + return parsed_folder + + +def convert_experiments_to_rows(folder_name, runs_max): + """ + Convert each experiment to a row in the csv + """ + rows = [] + directories = get_folders(folder_name) + for directory in directories: + folders = get_folders(directory) + for folder in folders: + # Skip debug folders + if "debug" in folder.name: + continue + + # Skip fine tuned models + if "ft_gpt-3.5" in folder.name: + continue + + # Skip folders without a results.json file + json_file = folder / "results.json" + if not json_file.exists(): + # print(f"No json file in {folder}") + continue + + # Extract run attributes + folder_data = parse_folder_name(folder.name) + model = folder_data[0] + dataset = folder_data[1] + if dataset.startswith("swe-bench-dev-easy-"): + dataset = dataset[len("swe-bench-dev-easy-") :] + elif dataset.startswith("swe-bench-dev-"): + dataset = dataset[len("swe-bench-dev-") :] + setup = folder_data[2] + if len(folder_data) != 8: + # TODO: This might be too strict? 
+ continue + temperature = float(folder_data[3][len("t-"):].strip()) + top_p = float(folder_data[4][len("p-"):].strip()) + cost = float(folder_data[5][len("c-"):].strip()) + install = "Y" if folder_data[6].strip() == "install-1" else "N" + + # Parse out run number + run = folder_data[-1] + if "run" not in run: + continue + + try: + if "run-" in run: + run = int(run.split("run-")[-1].split("-")[0].replace("_", "").strip()) + else: + run = int(run.split("run")[-1].split("-")[0].replace("_", "").strip()) + except Exception as e: + print(run) + raise e + + if runs_max is not None and run > runs_max: + continue + + # Load results.json file + with json_file.open() as file: + results_data = json.load(file) + report = results_data.get("report", {}) + + # Extract resolved ids (to calculate pass@k) + resolved_ids = [] + if "resolved" in results_data and isinstance(results_data["resolved"], list): + resolved_ids = results_data["resolved"] + elif "counts" in results_data and isinstance(results_data["counts"]["resolved"], list): + resolved_ids = results_data["counts"]["resolved"] + + # Extract instance costs from trajectories + costs_overall = [] + costs_success = [] + costs_failure = [] + for x in glob.glob(os.path.join(str(folder), "*.traj")): + traj_data = json.load(open(x)) + if "model_stats" not in traj_data["info"]: + continue + run_cost = traj_data["info"]["model_stats"]["instance_cost"] + inst_id = x.split("/")[-1].split(".")[0] + costs_overall.append(run_cost) + if inst_id in resolved_ids: + costs_success.append(run_cost) + else: + costs_failure.append(run_cost) + + # Create run row, write to csv + rows.append( + [ + model, + dataset, + setup, + temperature, + top_p, + cost, + install, + run, + report.get("# Not Generated", 0), + report.get("# Generated", 0), + report.get("# Applied", 0), + report.get("# Resolved", 0), + resolved_ids, + costs_success, + costs_failure, + costs_overall, + ] + ) + + return rows + + +def get_results_df(folder_name, runs_max): + rows = convert_experiments_to_rows(folder_name, runs_max) + return ( + pd.DataFrame(rows, columns=COLUMNS) + .sort_values(by=COLUMNS[:8]) + ) + + +def get_results_csv(folder_name): + get_results_df(folder_name).to_csv("results.csv") + print("Experiment results written to results.csv") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Aggregate results from experiments") + parser.add_argument("--folder", type=str, help="Folder containing experiment results", default="../trajectories") + parser.add_argument("--model", nargs='+', type=str, help="Model(s) to filter results by.") + parser.add_argument("--dataset", nargs='+', type=str, help="Dataset to filter results by.") + parser.add_argument("--setup", nargs='+', type=str, help="Setup to filter results by.") + parser.add_argument("--runs_min", type=int, help="Minimum number of runs that experiment should have been run for.") + parser.add_argument("--runs_max", type=int, help="Maximum number of runs taken into account") + args = parser.parse_args() + + df = get_results_df(args.folder, args.runs_max) + + grouped_data = ( + df.groupby(COLUMNS[:7]) + .agg( + { + "Run": "count", # Count the number of runs + "Not Generated": "mean", + "Generated": "mean", + "Applied": "mean", + "Resolved": "mean", + "Resolved IDs": lambda x: len(set([item for sublist in x for item in sublist])), + "Costs Success": lambda x: np.mean([item for sublist in x for item in sublist]), + "Costs Failure": lambda x: np.mean([item for sublist in x for item in sublist]), + "Costs Overall": lambda x: 
np.mean([item for sublist in x for item in sublist]), + } + ) + .round(2) + .reset_index() + .rename(columns={"Resolved IDs": "Pass@K", "Run": "Runs"}) + ) + + # Filtering + if args.model: + grouped_data = grouped_data[grouped_data['Model'].isin(args.model)] + if args.dataset: + grouped_data = grouped_data[grouped_data['Dataset'].isin(args.dataset)] + if args.setup: + grouped_data = grouped_data[grouped_data['Setup'].isin(args.setup)] + if args.runs_min: + grouped_data = grouped_data[grouped_data['Run'] >= args.runs_min] + + print(f"Total experiments run: {grouped_data.shape[0]}") + grouped_data_sorted = grouped_data.sort_values(by=['Dataset', 'Resolved'], ascending=[True, False]) + pd.set_option("display.max_rows", None) + grouped = grouped_data_sorted.groupby('Dataset') + + for name, group in grouped: + print(f'\n-----------------\nDataset: {name}\n-----------------') + print(group.to_string(index=False)) diff --git a/evaluation/evaluation.py b/evaluation/evaluation.py new file mode 100644 index 000000000..31800ba91 --- /dev/null +++ b/evaluation/evaluation.py @@ -0,0 +1,269 @@ +import argparse +import json +import os +import traceback + +from datasets import load_dataset, load_from_disk +from collections import Counter +from rich import print +from swebench import ( + KEY_INSTANCE_ID, + KEY_MODEL, + KEY_PREDICTION, + get_eval_report, + get_logs_eval, + get_model_report, + get_resolution_status, + run_evaluation, + get_eval_refs, +) +from swebench.harness.constants import ( + INSTALL_FAIL, +) +from unidiff import PatchSet + + +def main(predictions_path, log_dir, swe_bench_tasks, testbed, skip_existing, timeout, verbose, conda_link, log_suffix, num_processes): + # Check if paths exist + if not os.path.exists(predictions_path): + raise FileNotFoundError(f"Predictions path {predictions_path} does not exist") + eval_refs = get_eval_refs(swe_bench_tasks) + for k, v in eval_refs.items(): + eval_refs[k] = {key: v[key] for key in [KEY_INSTANCE_ID, "FAIL_TO_PASS", "PASS_TO_PASS"]} + + # Change model_name_or_patch field to directory name for all predictions + directory = os.path.dirname(predictions_path) + directory_name = directory.rsplit("/", 1)[-1] + pred_path_orig = predictions_path + pred_path_temp = predictions_path.replace(".jsonl", "_filtered.jsonl") + + pred_total, pred_will_eval = 0, 0 + with open(pred_path_temp, "w") as f: + for l in open(pred_path_orig, "r").readlines(): + pred_total += 1 + p = json.loads(l) + # Exclude predictions w/ empty strings + if p[KEY_PREDICTION] is not None and p[KEY_PREDICTION].strip() != "": + p[KEY_MODEL] = directory_name + json.dump(p, f) + f.write("\n") + pred_will_eval += 1 + print( + f"Found {pred_total} total predictions, will evaluate {pred_will_eval} ({pred_total-pred_will_eval} are empty)" + ) + + # Run evaluation + predictions_path = pred_path_temp + try: + print("๐Ÿƒ Beginning evaluation...") + run_evaluation( + predictions_path=predictions_path, + log_dir=log_dir, + swe_bench_tasks=swe_bench_tasks, + testbed=testbed, + skip_existing=skip_existing, + timeout=timeout, + verbose=verbose, + conda_link=conda_link, + log_suffix=log_suffix, + num_processes=num_processes + ) + print("โœ… Finished evaluation") + except Exception as e: + print(f"โŒ Evaluation failed: {e}\n{traceback.format_exc()}") + pass + print("==================================") + os.remove(pred_path_temp) + + # Get predictions, define log_dir + predictions = [json.loads(l) for l in open(pred_path_orig, "r").readlines()] + log_dir = os.path.join(log_dir, directory_name) + 
print(f"Log directory for evaluation run: {log_dir}") + + # Iterate through predictions + scorecards = [] + for p in predictions: + scorecard = {KEY_INSTANCE_ID: p[KEY_INSTANCE_ID], "statuses": [], "stats": {}} + + # Add trajectory statistics if traj_path exists + traj_path = os.path.join(directory, f"{p[KEY_INSTANCE_ID]}.traj") + if os.path.exists(traj_path): + traj_data = json.load(open(traj_path, "r")) + scorecard["stats"]["traj_num_steps"] = len(traj_data["trajectory"]) + scorecard["stats"]["traj_action_dist"] = dict( + Counter( + [ + entry["action"].strip().split()[0] + if entry["role"] == "assistant" and "action" in entry and len(entry["action"]) > 0 + else None + for entry in traj_data["history"] + ] + ) + ) + scorecard["exit_status"] = ( + traj_data["info"]["exit_status"] + if "exit_status" in traj_data["info"] + else "n/a" + ) + + # Check that a prediction was generated + if p[KEY_PREDICTION] is None or p[KEY_PREDICTION].strip() == "": + scorecard["statuses"].append("not_generated") + scorecards.append(scorecard) + continue + scorecard["statuses"].append("generated") + + # Get log file + log_path = os.path.join( + log_dir, f"{p[KEY_INSTANCE_ID]}.{directory_name}.eval.log" + ) + if not os.path.exists(log_path): + scorecard["statuses"].append("build_failure") + scorecards.append(scorecard) + continue + + # Get evaluation logs + eval_sm, found = get_logs_eval(log_path) + + # Check that the prediction generated + if not found: + scorecards.append(scorecard) + continue + scorecard["statuses"].append("applied") + + with open(log_path, "r") as f: + log_contents = f.read() + if INSTALL_FAIL in log_contents: + scorecard["statuses"].append("install_fail") + + # Get resolution status + report = get_eval_report(eval_sm, eval_refs[p[KEY_INSTANCE_ID]]) + scorecard["test_results"] = { + "failure": { + "FAIL_TO_PASS": report["FAIL_TO_PASS"]["failure"], + "PASS_TO_PASS": report["PASS_TO_PASS"]["failure"], + }, + "success": { + "FAIL_TO_PASS": report["FAIL_TO_PASS"]["success"], + "PASS_TO_PASS": report["PASS_TO_PASS"]["success"], + } + } + resolution_status = get_resolution_status(report) + scorecard["statuses"].append(resolution_status) + + diff_obj = PatchSet(p[KEY_PREDICTION]) + scorecard["patch_files"] = [ + x.path + for x in diff_obj.modified_files + + diff_obj.added_files + + diff_obj.removed_files + ] + scorecard["patch_lines_add"] = sum([f.added for f in diff_obj]) + scorecard["patch_lines_del"] = sum([f.removed for f in diff_obj]) + scorecards.append(scorecard) + + # Calculate cumulative results + get_ids_with_status = lambda x: [ + s[KEY_INSTANCE_ID] for s in scorecards if x in s["statuses"] + ] + report = { + "# Not Generated": len(get_ids_with_status("not_generated")), + "# Generated": len(get_ids_with_status("generated")), + "# Applied": len(get_ids_with_status("applied")), + "# Resolved": len(get_ids_with_status("RESOLVED_FULL")), + "# Install Fail": len(get_ids_with_status("install_fail")), + } + print(f"== Evaluation Report ==\n{report}") + + report_exits = dict( + Counter([s["exit_status"] if "exit_status" in s else "n/a" for s in scorecards]) + ) + + # Save to summary, scorecard json + path_scorecards = os.path.join(directory, "scorecards.json") + with open(path_scorecards, "w") as f: + json.dump(scorecards, fp=f, indent=2) + print(f"- Wrote per-instance scorecards to {path_scorecards}") + + path_results = os.path.join(directory, "results.json") + with open(path_results, "w") as f: + json.dump( + { + "report": report, + "report_exits": report_exits, + "not_generated": 
get_ids_with_status("not_generated"), + "generated": get_ids_with_status("generated"), + "applied": get_ids_with_status("applied"), + "resolved": get_ids_with_status("RESOLVED_FULL"), + "install_fail": get_ids_with_status("install_fail"), + }, + fp=f, + indent=2, + ) + print(f"- Wrote summary of run to {path_results}") + + # Sanity check against get_model_report + report = get_model_report( + directory_name, pred_path_orig, swe_bench_tasks, log_dir + ) + by_outcome = {} + by_outcome_func = lambda status: len( + [ + instance_id + for _, v in report.items() + if isinstance(v, dict) + for instance_id in v[status] + ] + ) + by_outcome["# Not Generated"] = by_outcome_func("none") + by_outcome["# Generated"] = by_outcome_func("generated") + by_outcome["# Applied"] = by_outcome_func("applied") + by_outcome["# Resolved"] = by_outcome_func("resolved") + by_outcome["# Install Fail"] = by_outcome_func("install_fail") + print(f"Reference Report:\n{by_outcome}") + + +if __name__ == "__main__": + # Parse arguments + parser = argparse.ArgumentParser() + parser.add_argument( + "--predictions_path", + type=str, + help="Path to predictions file (.jsonl)", + required=True, + ) + parser.add_argument( + "--log_dir", type=str, help="Path to log directory", required=True + ) + parser.add_argument( + "--swe_bench_tasks", + type=str, + help="Path to SWE-bench task instances file", + required=True, + ) + parser.add_argument( + "--testbed", type=str, help="Path to testbed directory", required=True + ) + parser.add_argument( + "--skip_existing", action="store_true", help="(Optional) Skip existing logs" + ) + parser.add_argument( + "--timeout", + type=int, + help="(Optional) Timeout in seconds (default: 900)", + default=900, + ) + parser.add_argument( + "--verbose", action="store_true", help="(Optional) Verbose mode" + ) + parser.add_argument( + "--conda_link", default=None, type=str, help="(Optional) URL to conda installation to use" + ) + parser.add_argument( + "--log_suffix", default=None, type=str, help="(Optional) Log suffix" + ) + parser.add_argument( + "--num_processes", default=-1, type=int, help="Num processes" + ) + args = parser.parse_args() + main(**vars(args)) diff --git a/evaluation/run_eval.sh b/evaluation/run_eval.sh new file mode 100755 index 000000000..c22c10942 --- /dev/null +++ b/evaluation/run_eval.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# The first positional argument +predictions_path=$1 + +# Check if predictions_path is not provided +if [ -z "$predictions_path" ]; then + echo "Usage: $0 [dataset_name_or_path] [results_dir] [testbed_dir]" + exit 1 +fi + +# Default values for the optional arguments +dataset_name_or_path="${2:-princeton-nlp/SWE-bench}" +results_dir="${3:-results}" +testbed_dir="${4:-testbed}" + +# If results or testbed directories do not exist, create them +if [ ! -d "$results_dir" ]; then + mkdir -p "$results_dir" + echo "Created results directory at $results_dir" +fi + +if [ ! 
-d "$testbed_dir" ]; then + mkdir -p "$testbed_dir" + echo "Created testbed directory at $testbed_dir" +fi + +# Run the Python script with the specified arguments +python evaluation.py \ + --predictions_path "$predictions_path" \ + --swe_bench_tasks "$dataset_name_or_path" \ + --log_dir "$results_dir" \ + --testbed "$testbed_dir" \ + --skip_existing \ + --timeout 900 \ + --verbose diff --git a/inspector/README.md b/inspector/README.md new file mode 100644 index 000000000..7981b2c3c --- /dev/null +++ b/inspector/README.md @@ -0,0 +1,23 @@ +# Inspector +We provide a web interface for visualizing `.traj` files from the `trajectories` folder more easily. + +**Set Up** +* Run `python server.py trajectories` +* Open http://localhost:8000 in your browser to use the inspector. + +**Additional flags** +- `--data_path`: Path to SWE-bench style dataset that trajectories were generated for (Optional) +- `--directory`: Directory of trajectories to inspect (Defaults to `./trajectories` folder) +- `--port`: Port to host web app (Defaults to `8000`). + +**Example Usage** + +From running the command: +``` +python server.py --directory trajectories/carlosejimenez/gpt-4-1106-preview__swe-bench-dev-40-seed24__default_sys-env_window100-detailed_cmd_format-full_history-1_demos__t-0.20__p-0.95__c-4.00__install-1__sweep-01-run-4 +``` +The inspector will then be launched in the browser: + +

+*(Screenshot: the trajectory inspector web UI rendered in the browser; see swe-agent.com)*
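+
+A hypothetical invocation that combines the optional flags documented above (the dataset argument and port are placeholders):
+```
+python server.py --directory trajectories --data_path princeton-nlp/SWE-bench --port 8080
+```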

diff --git a/inspector/__init__.py b/inspector/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/inspector/favicon.ico b/inspector/favicon.ico
new file mode 100644
index 000000000..362b4e1b2
Binary files /dev/null and b/inspector/favicon.ico differ
diff --git a/inspector/fileViewer.js b/inspector/fileViewer.js
new file mode 100644
index 000000000..7cc9896c8
--- /dev/null
+++ b/inspector/fileViewer.js
@@ -0,0 +1,133 @@
+let currentFileName = null; // Store the current file name
+let trajectoryDirectory = ''; // Global variable to store the directory
+let timeoutIds = []; // Store timeout IDs for pending operations
+
+
+function getBaseUrl() {
+    const protocol = window.location.protocol;
+    const host = window.location.hostname;
+    const port = window.location.port;
+
+    // Use the default port if the port number is empty (for standard HTTP/HTTPS)
+    const defaultPort = (protocol === 'http:' && !port) ? '80' : (protocol === 'https:' && !port) ? '443' : port;
+
+    return `${protocol}//${host}:${defaultPort}`;
+}
+
+
+function fetchFiles() {
+    const baseUrl = getBaseUrl();
+    fetch(`${baseUrl}/files`)
+        .then(response => response.json())
+        .then(files => {
+            const fileList = document.getElementById('fileList');
+            fileList.innerHTML = '';
+            files.forEach(file => {
+                const fileElement = document.createElement('li');
+                fileElement.textContent = file;
+                fileElement.onclick = () => viewFile(file.split(' ')[0]);
+                fileList.appendChild(fileElement);
+            });
+        });
+}
+
+function viewFile(fileName) {
+    // Clear any pending message loading from previous files
+    timeoutIds.forEach(timeoutId => clearTimeout(timeoutId));
+    timeoutIds = []; // Reset the list of timeout IDs
+
+    const baseUrl = getBaseUrl();
+    fetch(`${baseUrl}/trajectory/${fileName}`)
+        .then(response => {
+            if (!response.ok) {
+                throw new Error('Network response was not ok');
+            }
+            return response.json();
+        })
+        .then(content => {
+            const container = document.getElementById('fileContent');
+            container.innerHTML = ''; // Clear existing content
+
+            if (content.history && Array.isArray(content.history)) {
+                let delay = 200; // Initial delay
+                const delayIncrement = 50; // Delay between each message, in milliseconds
+
+                content.history.forEach((item, index) => {
+                    const timeoutId = setTimeout(() => {
+                        // Escape HTML special characters so message content renders as text
+                        const contentText = item.content ? item.content.replace(/</g, '&lt;').replace(/>/g, '&gt;') : '';
+                        let roleClass = item.agent && item.agent !== "primary" ? "subroutine" : item.role ? item.role.toLowerCase().replaceAll(' ', '-') : 'default';
+                        const elementId = 'historyItem' + index;
+                        const historyItem = document.createElement('div');
+                        historyItem.className = `history-item ${roleClass} fade-in`;
+                        historyItem.id = elementId;
+                        if (contentText.includes("--- DEMONSTRATION ---")) {
+                            item.role = "demo";
+                        }
+                        else if ('is_demo' in item && item.is_demo === true) {
+                            item.role += '[demo]';
+                        }
+                        historyItem.innerHTML = `
+                            <div class="role ${roleClass}">
+                                <strong>${item.role}</strong>
+                            </div>
+                            <div class="content">
+                                <pre>${contentText}</pre>
+                            </div>
+ `; + container.appendChild(historyItem); + }, delay); + + delay += delayIncrement; // Increment delay for the next message + timeoutIds.push(timeoutId); // Store the timeout ID + }); + } else { + container.textContent = 'No history content found.'; + } + }) + .catch(error => { + console.error('Error fetching file:', error); + document.getElementById('fileContent').textContent = 'Error loading content. ' + error; + }); + + // Highlight the selected file in the list + document.querySelectorAll('#fileList li').forEach(li => { + li.classList.remove('selected'); + if (li.textContent.split(' ')[0] === fileName) { + li.classList.add('selected'); + } + }); +} + +function refreshCurrentFile() { + if (currentFileName) { + const currentScrollPosition = document.documentElement.scrollTop || document.body.scrollTop; + viewFile(currentFileName.split(' ')[0]); // Reload the current file + // Restore the scroll position after the content is loaded + setTimeout(() => { + window.scrollTo(0, currentScrollPosition); + }, 100); + } +} + +function fetchDirectoryInfo() { + const baseUrl = getBaseUrl(); + fetch(`${baseUrl}/directory_info`) + .then(response => response.json()) + .then(data => { + if (data.directory) { + trajectoryDirectory = data.directory; // Store the directory + document.title = `Trajectory Viewer: ${data.directory}`; + document.querySelector('h1').textContent = `Trajectory Viewer: ${data.directory}`; + } + }) + .catch(error => console.error('Error fetching directory info:', error)); +} + + +window.onload = function() { + fetchFiles(); + fetchDirectoryInfo(); +}; \ No newline at end of file diff --git a/inspector/icons/computer.png b/inspector/icons/computer.png new file mode 100644 index 000000000..11a27054b Binary files /dev/null and b/inspector/icons/computer.png differ diff --git a/inspector/icons/edit_icon.svg b/inspector/icons/edit_icon.svg new file mode 100644 index 000000000..11bcba7d7 --- /dev/null +++ b/inspector/icons/edit_icon.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/inspector/icons/swe-agent-logo-50.png b/inspector/icons/swe-agent-logo-50.png new file mode 100644 index 000000000..4ff29d077 Binary files /dev/null and b/inspector/icons/swe-agent-logo-50.png differ diff --git a/inspector/icons/swellama_blue.png b/inspector/icons/swellama_blue.png new file mode 100644 index 000000000..4797ff1a6 Binary files /dev/null and b/inspector/icons/swellama_blue.png differ diff --git a/inspector/icons/swellama_brown.png b/inspector/icons/swellama_brown.png new file mode 100644 index 000000000..abf131d1b Binary files /dev/null and b/inspector/icons/swellama_brown.png differ diff --git a/inspector/icons/swellama_grey.png b/inspector/icons/swellama_grey.png new file mode 100644 index 000000000..349ed0909 Binary files /dev/null and b/inspector/icons/swellama_grey.png differ diff --git a/inspector/icons/swellama_tan.png b/inspector/icons/swellama_tan.png new file mode 100644 index 000000000..0910a08be Binary files /dev/null and b/inspector/icons/swellama_tan.png differ diff --git a/inspector/index.html b/inspector/index.html new file mode 100644 index 000000000..a5613bff9 --- /dev/null +++ b/inspector/index.html @@ -0,0 +1,18 @@ + + + Trajectory Viewer + + + + +
+    <div class="container">
+        <h1>Trajectory File Viewer</h1>
+        <ul id="fileList"></ul>
+        <h2>Conversation History</h2>
+        <div class="button-container">
+            <button id="refreshButton" onclick="refreshCurrentFile()">Refresh</button>
+        </div>
+        <pre id="fileContent">No file selected.</pre>
+    </div>
+    <script src="fileViewer.js"></script>
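+    <!-- fileList and fileContent above are populated by fetchFiles() and viewFile() in fileViewer.js -->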
    + + diff --git a/inspector/server.py b/inspector/server.py new file mode 100644 index 000000000..81910683b --- /dev/null +++ b/inspector/server.py @@ -0,0 +1,304 @@ +import http.server +import json +import os +import socketserver +import traceback +import yaml + +from pathlib import Path +from argparse import ArgumentParser +from functools import partial + + +def append_exit(content): + last_entry = content["history"][-1] + if last_entry["role"] == "system": + return content + + exit_status = content.get("info", {}).get("exit_status", None) + + if exit_status is None: + return content + + if exit_status.startswith("submitted"): + if "submission" in content["info"]: + submission = content["info"]["submission"] + content["history"].append({ + "role": "model_patch", + "content": submission, + }) + # else submission should be in history already + else: + raise ValueError("No submission in history or info") + # elif content.get("info", {}).get("exit_status", None) is not None: + # content["history"].append({ + # "role": "system", + # "content": f"Exited - {content['info']['exit_status']}", + # }) + return content + + +def append_patch(instance_id, content, patches, patch_type): + if content.get("info", {}).get("exit_status", None) is not None: + if instance_id in patches: + content["history"].append({ + "role": f"{patch_type} Patch", + "content": patches[instance_id], + }) + return content + + +def append_results(traj_path, instance_id, content, results, results_file, scorecards, scorecards_file): + stats = [] + model_stats = {} + if traj_path.exists(): + data = json.loads(traj_path.read_text()) + info = data.get("info", {}) + model_stats = info.get("model_stats", {}) + instance_cost = model_stats.get("instance_cost", None) + instance_cost = f'{instance_cost:.2f}' if instance_cost is not None else 'N/A' + tokens_sent = model_stats.get("tokens_sent", None) + tokens_sent = f'{tokens_sent:,}' if tokens_sent is not None else 'N/A' + tokens_received = model_stats.get("tokens_received", None) + tokens_received = f'{tokens_received:,}' if tokens_received is not None else 'N/A' + api_calls = model_stats.get("api_calls", None) + api_calls = f'{api_calls:,}' if api_calls is not None else 'N/A' + stats.append(f"**** Run Stats ****") + stats.append(f"Instance Cost: ${instance_cost}") + stats.append(f"Tokens Sent: {tokens_sent}") + stats.append(f"Tokens Received: {tokens_received}") + stats.append(f"API Calls: {api_calls}\n") + status = [] + if results is None: + status.append("Evaluation results not found") + elif "not_generated" in results and "generated" in results and "applied" in results and "resolved" in results: + is_generated = instance_id in results["generated"] + is_applied = instance_id in results["applied"] + is_resolved = instance_id in results["resolved"] + + status.append("**** Statuses ****") + status.append( + f" {'โœ…' if is_generated else 'โŒ'} Generated (The agent was {'' if is_generated else 'not '}" + "able to generate a pull request to address this issue)") + status.append( + f" {'โœ…' if is_applied else 'โŒ'} Applied (The pull request was {'' if is_applied else 'not '}" + "successfully applied to the repo during eval)") + status.append( + f" {'โœ…' if is_resolved else 'โŒ'} Resolved (The pull request {'' if is_resolved else 'not '}" + "successfully resolved the issue during eval)") + else: + status.append("Results format not recognized") + + if scorecards is not None: + scorecard = [x for x in scorecards if x["instance_id"] == instance_id][0] + if "test_results" in scorecard 
and "failure" in scorecard["test_results"] and ( + len(scorecard["test_results"]["failure"]["FAIL_TO_PASS"]) > 0 or + len(scorecard["test_results"]["failure"]["PASS_TO_PASS"]) > 0 + ): + tests_failing = [ + f" - {x}" for x in scorecard["test_results"]["failure"]["FAIL_TO_PASS"] + ] + [ + f" - {x}" for x in scorecard["test_results"]["failure"]["PASS_TO_PASS"] + ] + status.extend(["", "**** Test Results ****", "๐Ÿงช Tests Failed"] + tests_failing[:7]) + if len(tests_failing) > 7: + status.append(f" ... and {len(tests_failing) - 7} more") + status.append("") + + if status == []: + status.append("Instance not found in results") + else: + status.append("---------------------------") + status.append("Note that the evaluation results here may not be accurate or up to date, since they are computed seperately from the agent run itself.") + results_relative = results_file.resolve().relative_to(Path(__file__).resolve().parent.parent) + status.append(f"Check {results_relative} for the most accurate evaluation results.") + status.append("") + status.append(f"Instance ID: {instance_id}") + status.append("Based on results:") + status.append(json.dumps(results, indent=4)) + eval_report = { + "role": "Evaluation Report", + "content": "\n".join([*stats, *status]), + } + content["history"].insert(0, eval_report) + content["history"].append(eval_report) + return content + + +def load_content(file_name, gold_patches, test_patches): + with open(file_name) as infile: + content = json.load(infile) + results_file = Path(file_name).parent / "results.json" + results = None + if results_file.exists(): + with open(results_file) as infile: + results = json.load(infile) + + scorecards_file = Path(file_name).parent / "scorecards.json" + scorecards = None + if scorecards_file.exists(): + with open(scorecards_file) as infile: + scorecards = json.load(infile) + + content = append_exit(content) # accomodate new and old format + content = append_patch(Path(file_name).stem, content, gold_patches, "Gold") + content = append_patch(Path(file_name).stem, content, test_patches, "Test") + content = append_results( + Path(file_name), + Path(file_name).stem, + content, + results, + results_file, + scorecards, + scorecards_file, + ) + return content + + +def load_results(traj_path): + results_file = Path(traj_path).parent / "results.json" + if results_file.exists(): + with open(results_file) as infile: + return json.load(infile) + return None + + +def get_status(traj_path): + results = load_results(traj_path) + instance_id = Path(traj_path).stem + if results is None: + return "โ“" + elif "not_generated" in results and "generated" in results and "applied" in results and "resolved" in results: + if instance_id in results["not_generated"]: + return "โ“" + if instance_id in results["generated"]: + if instance_id in results["resolved"]: + return "โœ…" + else: + return "โŒ" + return "โ“" + + +class Handler(http.server.SimpleHTTPRequestHandler): + file_mod_times = {} # Dictionary to keep track of file modification times + + def __init__(self, *args, **kwargs): + self.gold_patches = {} + self.test_patches = {} + if "gold_patches" in kwargs: + self.gold_patches = kwargs.pop("gold_patches") + if "test_patches" in kwargs: + self.test_patches = kwargs.pop("test_patches") + self.traj_dir = kwargs.pop('directory', '.') # Extract directory + super().__init__(*args, **kwargs) + + def serve_directory_info(self): + self.send_response(200) + self.send_header('Content-type', 'application/json') + self.end_headers() + 
self.wfile.write(json.dumps({"directory": self.traj_dir}).encode()) + + def serve_file_content(self, file_path): + try: + content = load_content( + Path(self.traj_dir) / file_path, + self.gold_patches, + self.test_patches, + ) + self.send_response(200) + self.send_header('Content-type', 'text/plain') + self.end_headers() + self.wfile.write(json.dumps(content).encode()) + except FileNotFoundError: + self.send_error(404, f"File {file_path} not found") + + def do_GET(self): + if self.path == '/directory_info': + self.serve_directory_info() + elif self.path.startswith('/files'): + self.handle_files_request() + elif self.path.startswith('/trajectory/'): + file_path = self.path[len('/trajectory/'):] + self.serve_file_content(file_path) + elif self.path.startswith('/check_update'): + self.check_for_updates() + else: + super().do_GET() + + def handle_files_request(self): + self.send_response(200) + self.send_header('Content-type', 'application/json') + self.end_headers() + files = sorted( + [ + str(file.relative_to(Path(self.traj_dir))) + " " * 4 + get_status(file) + for file in Path(self.traj_dir).glob('**/*.traj') + ], + key=lambda x: str(Path(self.traj_dir) / x), reverse=True + ) + self.wfile.write(json.dumps(files).encode()) + + def check_for_updates(self): + current_mod_times = {str(file): os.path.getmtime(file) for file in Path(self.traj_dir).glob('**/*.traj')} + if current_mod_times != Handler.file_mod_times: + Handler.file_mod_times = current_mod_times + self.send_response(200) # Send response that there's an update + else: + self.send_response(204) # Send no content response if no update + self.end_headers() + + def end_headers(self): + self.send_header('Access-Control-Allow-Origin', '*') + super().end_headers() + + +def main(data_path, directory, port): + data = [] + if data_path is not None: + if data_path.endswith(".jsonl"): + data = [json.loads(x) for x in open(data_path).readlines()] + elif data_path.endswith(".json"): + data = json.load(open(data_path)) + elif "args.yaml" in os.listdir(directory): + args = yaml.safe_load(open(os.path.join(directory, "args.yaml"))) + if "environment" in args and "data_path" in args["environment"]: + data_path = os.path.join( + Path(__file__).parent, "..", + args["environment"]["data_path"] + ) + if os.path.exists(data_path): + data = json.load(open(data_path, "r")) + + gold_patches = { + d["instance_id"]: d["patch"] + if "patch" in d else None for d in data + } + test_patches = { + d["instance_id"]: d["test_patch"] + if "test_patch" in d else None for d in data + } + + handler_with_directory = partial( + Handler, + directory=directory, + gold_patches=gold_patches, + test_patches=test_patches, + ) + try: + with socketserver.TCPServer(("", port), handler_with_directory) as httpd: + print(f"Serving at http://localhost:{port}") + httpd.serve_forever() + except OSError as e: + if e.errno == 48: + print(f"ERROR: Port ({port}) is already in use. 
Try another port with the --port flag.") + else: + raise e + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--data_path", type=str, help="Path to dataset that was used for the trajectories") + parser.add_argument("--directory", type=str, help="Directory to serve", default="./trajectories", nargs='?') + parser.add_argument("--port", type=int, help="Port to serve", default=8000) + args = parser.parse_args() + main(**vars(args)) diff --git a/inspector/static.py b/inspector/static.py new file mode 100644 index 000000000..6288ad18d --- /dev/null +++ b/inspector/static.py @@ -0,0 +1,169 @@ +import json +import logging +import yaml +from pathlib import Path +from tqdm.auto import tqdm +from argparse import ArgumentParser +try: + from .server import load_content +except ImportError: + from server import load_content + + +logger = logging.getLogger(__name__) +logging.getLogger("simple_parsing").setLevel(logging.INFO) + + +TEMPLATE = """ + + + Trajectory Viewer + + + +
+    <style>
+        {style_sheet}
+    </style>
+</head>
+<body>
+    <div class="container">
+        {file_path_tree}
+        <h2>Conversation History</h2>
+        <pre id="fileContent">{file_content}</pre>
+    </div>
+</body>
+</html>
+"""
+
+try:
+    with open(Path(__file__).parent / 'style.css', 'r') as infile:
+        STYLE_SHEET = infile.read()
+except Exception as e:
+    style_file = Path(__file__).parent / 'style.css'
+    logger.error(f"Failed to load style sheet from {style_file}: {e}")
+    raise e
+
+
+def _load_file(file_name, gold_patches, test_patches):
+    try:
+        role_map = {
+            "user": "Computer",
+            "assistant": "SWE-Agent",
+            "subroutine": "SWE-Agent subroutine",
+            "default": "Default",
+            "system": "System",
+        }
+        content = load_content(file_name, gold_patches, test_patches)
+        if 'history' in content and isinstance(content['history'], list):
+            history_content = ""
+            for index, item in enumerate(content['history']):
+                item_content = item.get('content', '').replace('<', '&lt;').replace('>', '&gt;')
+                if item.get('agent') and item['agent'] != "primary":
+                    role_class = "subroutine"
+                else:
+                    role_class = item.get('role', 'default').lower().replace(' ', '-')
+                element_id = f"historyItem{index}"
+                role_name = role_map.get(item.get('role', ''), item.get('role', ''))
+                history_content += (
+                    f'''<div class="history-item {role_class}" id="{element_id}">'''
+                    f'''<div class="role-bar {role_class}"><strong><span>{role_name}</span></strong></div>'''
+                    f'''<div class="content-container">'''
+                    f'''<pre>{item_content}</pre>'''
+                    f'''</div>'''
+                    f'''<div class="shadow"></div>'''
+                    f'''</div>
    ''' + ) + return history_content + else: + return 'No history content found.' + except Exception as e: + return f"Error loading content. {e}" + + +def _make_file_path_tree(file_path): + path_parts = file_path.split('/') + relevant_parts = path_parts[-3:] + html_string = '
<div class="filepath">\n'
+    for part in relevant_parts:
+        html_string += f'<div class="part">{part}</div>\n'
+    html_string += '</div>
    ' + return html_string + + +def save_static_viewer(file_path): + if not isinstance(file_path, Path): + file_path = Path(file_path) + data = [] + if "args.yaml" in list(map(lambda x: x.name, file_path.parent.iterdir())): + args = yaml.safe_load(open(file_path.parent / "args.yaml", "r")) + if "environment" in args and "data_path" in args["environment"]: + data_path = Path(__file__).parent.parent / args["environment"]["data_path"] + if data_path.exists(): + data = json.load(open(data_path, "r")) + if ( + not isinstance(data, list) or + not data or + 'patch' not in data[0] or + 'test_patch' not in data[0] + ): + data = [] + gold_patches = {x["instance_id"]: x["patch"] for x in data} + test_patches = {x["instance_id"]: x["test_patch"] for x in data} + content = _load_file(file_path, gold_patches, test_patches) + file_path_tree = _make_file_path_tree(file_path.absolute().as_posix()) + icons_path = Path(__file__).parent / 'icons' + relative_icons_path = find_relative_path(file_path, icons_path) + style_sheet = STYLE_SHEET.replace( + "url('icons/", + f"url('{relative_icons_path.as_posix()}/" + ).replace( + 'url("icons/', + f'url("{relative_icons_path.as_posix()}/' + ) + data = TEMPLATE.format(file_content=content, style_sheet=style_sheet, file_path_tree=file_path_tree) + output_file = file_path.with_suffix('.html') + with open(output_file, 'w+') as outfile: + print(data, file=outfile) + logger.info(f"Saved static viewer to {output_file}") + +def find_relative_path(from_path, to_path): + # Convert paths to absolute for uniformity + from_path = from_path.resolve() + to_path = to_path.resolve() + if from_path.is_file(): + from_path = from_path.parent + if to_path.is_file(): + to_path = to_path.parent + if not from_path.is_dir() or not to_path.is_dir(): + raise ValueError(f"Both from_path and to_path must be directories, but got {from_path} and {to_path}") + + # Identify the common ancestor and the parts of each path beyond it + common_parts = 0 + for from_part, to_part in zip(from_path.parts, to_path.parts): + if from_part != to_part: + break + common_parts += 1 + + # Calculate the '../' needed to get back from from_path to the common ancestor + back_to_ancestor = ['..'] * (len(from_path.parts) - common_parts) + + # Direct path from common ancestor to to_path + to_target = to_path.parts[common_parts:] + + # Combine to get the relative path + relative_path = Path(*back_to_ancestor, *to_target) + return relative_path + + +def save_all_trajectories(directory): + if not isinstance(directory, Path): + directory = Path(directory) + all_files = list(directory.glob('**/*.traj')) + logger.info(f"Found {len(all_files)} trajectory files in {directory}") + for file_path in tqdm(all_files, desc="Saving static viewers"): + save_static_viewer(file_path) + logger.info(f"Saved static viewers for all trajectories in {args.directory}") + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("directory", type=str, help="Directory containing trajectory files") + args = parser.parse_args() + save_all_trajectories(args.directory) \ No newline at end of file diff --git a/inspector/style.css b/inspector/style.css new file mode 100644 index 000000000..fd7c5921e --- /dev/null +++ b/inspector/style.css @@ -0,0 +1,254 @@ +body { + font-family: Arial, sans-serif; + margin: 0; + padding: 20px; + background-color: #f4f4f4; +} +h1, h2 { + color: #333; +} +#fileList { + list-style-type: none; + padding: 10px; + max-height: 400px; + overflow-y: auto; + margin: 0; + background-color: #eee; + border: 3px 
solid #1697e2; + border-radius: 10px; +} +#fileList li { + cursor: pointer; + padding: 10px; + background-color: #fff; + margin-bottom: 5px; + border: 2px solid #333; + border-radius: 5px; + transition: background-color 0.3s; +} +#fileList li:hover { + background-color: #bbb; +} +#fileList li.selected { + border-color: #fabb00; +} +#fileContent { + background-color: #fff; + border: 1px solid #ddd; + padding: 10px; + margin-top: 20px; + white-space: pre-wrap; +} +.button-container { + display: flex; + justify-content: center; + align-items: center; +} +#refreshButton { + padding: 4px 10px; + min-width: 80px; + border: none; + font: inherit; + color: #373030; + border-radius: 10px; + outline: none; + text-decoration: none; + cursor: default; + font-weight: 400; + background: #fff; + box-shadow: 0px 0px 1px #0000004d, 0px 1px 1px #00000066; +} +#refreshButton:hover { + /* hover MUST come before active */ + background: linear-gradient(#0000004d, #00000066); + color: #fff; + position: relative; +} +#refreshButton:active { + background: linear-gradient(#4faefc, #006bff); + color: #fff; + position: relative; +} +.history-item { + border: 3px solid black; + border-radius: 5px; + padding: 0px; + /* padding-bottom: 5%; */ + margin-bottom: 15px; + overflow-x: hidden; + white-space: normal; + /* overflow-x: auto; Enables horizontal scrolling */ + /* white-space: nowrap; Keeps content in a single line */ + max-height: 450px; /* Adjust as needed for 25 lines */ + overflow: hidden; + position: relative; +} +.shadow { + height: 30px; /* Height of the shadow */ + background: linear-gradient(to bottom, transparent, rgba(0, 0, 0, 0.4)); + position: absolute; + bottom: 0; + left: 0; + right: 0; + pointer-events: none; /* Ensures the shadow doesn't interfere with interaction */ + display: none; /* Initially hidden */ +} +.has-shadow .shadow { + display: block; +} +.content-container { + max-height: 400px; /* Adjust as needed */ + overflow-y: auto; + position: relative; + padding: 10px; +} +.content-container pre { + white-space: pre-wrap; /* Wrap lines and preserve whitespace */ + overflow-wrap: break-word; /* Handle long words */ +} +.container { + max-width: 1000px; + margin: 0 auto; /* Centers the container */ + padding: 20px; /* Optional: for some inner spacing */ +} +.role-bar { + background-color: #fff; + padding: 5px 10px; + margin: 0; + border-bottom: 1px solid #fff; + font-size: 18px; + display: flex; + align-items: center; + background-repeat: no-repeat; + background-size: 22px 22px; + padding-left: 35px; /* Adjust according to your icon size */ + padding-right: 35px; /* Same padding on the right for balance */ +} +.role-bar.user { + background-image: url('icons/swellama_brown.png'); + background-color: #9cdcff; + justify-content: flex-end; /* Aligns content to the right */ + background-position: right 10px center; /* Moves the image to the right */ + text-align: right; /* Text aligned right */ +} +.role-bar.tool { + background-image: url('icons/swellama_brown.png'); + background-color: #1eacf8; + justify-content: flex-end; /* Aligns content to the right */ + background-position: right 10px center; /* Moves the image to the right */ + text-align: right; /* Text aligned right */ +} +.role-bar.system { + background-image: url('icons/swellama_blue.png'); + background-color: #8aa8bd; + justify-content: center; /* Center the flex container contents */ + background-position: calc(50% - 32px) center; /* Adjust position */ + text-align: center; /* Center the text */ +} +.role-bar.system span { + 
padding-left: 32px; +} +.role-bar.subroutine { + background-image: url('icons/swellama_blue.png'); + background-color: #7bb776; + justify-content: center; + background-position: calc(50% - 45px) center; + text-align: center; +} +.role-bar.subroutine span { + padding-left: 45px; +} +.role-bar.model-patch { + background-image: url('icons/swellama_blue.png'); + background-color: #bdbdbd; + justify-content: center; + background-position: calc(50% - 45px) center; + text-align: center; +} +.role-bar.gold-patch { + background-image: url('icons/swellama_blue.png'); + background-color: #ffc65b; + justify-content: center; + background-position: calc(50% - 46px) center; + text-align: center; +} +.role-bar.gold-patch span { + padding-left: 46px; +} +.role-bar.test-patch { + background-image: url('icons/swellama_blue.png'); + background-color: #8888ff; + justify-content: center; + background-position: calc(50% - 46px) center; + text-align: center; +} +.role-bar.test-patch span { + padding-left: 46px; +} +.role-bar.assistant { + background-image: url('icons/swe-agent-logo-50.png'); + background-color: #FFF7EC; + background-position: left 10px center; +} +.role-bar.evaluation-report { + background-image: url('icons/swellama_blue.png'); + background-color: #86ad7a; + justify-content: center; + background-position: calc(50% - 96px) center; + text-align: center; +} +.role-bar.evaluation-report span { + padding-left: 23px; +} +.history-item.user { + border-color: #1697e2; +} +.history-item.tool { + border-color: #1483c3; +} +.history-item.system { + border-color: #004b80; +} +.history-item.subroutine { + border-color: #006b00; +} +.history-item.gold-patch { + border-color: #fabb00; +} +.history-item.assistant { + border-color: rgb(0, 0, 0); +} +.history-item.test-patch { + border-color: #7373d9; +} +.history-item.evaluation-report { + border-color: #35614b; +} + + +/* filepath-tree stuff */ +.filepath { + display: flex; + flex-direction: column; /* Changes layout to one part per line */ + align-items: flex-start; /* Aligns parts to the start of the container */ + font-size: 16px; + gap: 10px; + padding: 5px; + background-color: #f4f4f4; + } + .part { + border: 1px solid #ddd; + white-space: nowrap; /* Prevents wrapping within parts */ + padding: 5px; + background-color: #fff; + border-radius: 5px; + } + +@keyframes fadeIn { + from { opacity: 0; } + to { opacity: 1; } +} + +.fade-in { + animation: fadeIn 1s ease-out; +} diff --git a/inspector/trajectories b/inspector/trajectories new file mode 120000 index 000000000..78c5e5b61 --- /dev/null +++ b/inspector/trajectories @@ -0,0 +1 @@ +../trajectories \ No newline at end of file diff --git a/make_demos/README.md b/make_demos/README.md new file mode 100644 index 000000000..2b0094ce5 --- /dev/null +++ b/make_demos/README.md @@ -0,0 +1,15 @@ +# Make demos +An important way to show LMs how to use commands and interact with the environment is through providing a demonstration - which is basically a completed trajectory that the LM can learn from. + +For simplicity we just ingest demonstration in the from of a trajectory file. However, since trajectory files are usually JSON, you can convert them to yaml using the `convert_traj_to_demo.py` script. + +Demo (yaml) files are stored in the `make_demos/demos` directory by default and consist primarily of the sequence of actions that an LM would need to take to complete a task. It's important that your demo have the proper format to be parsed by SWE-agent and your config.s + +Here's how you can make a demo: +1. 
Find a basic trajectory that you already like and want to use as the basis for your demo.
+   - For instance, consider the `trajectories/demonstrations/replay__marshmallow-code__marshmallow-1867__default_sys-env_window100__t-0.20__p-0.95__c-2.00__install-1/marshmallow-code__marshmallow-1867.traj` trajectory for reference.
+2. Run `python convert_traj_to_demo.py <trajectory path>` to convert the trajectory to a demo.
+   - This demo will be saved as a readable yaml file in the `make_demos/demos` directory.
+3. Edit the demo by hand to make it work for your particular use case and configuration.
+4. Run `python run_replay.py --traj_path <demo yaml path> --config_file <config file>` to execute the demo and ensure it works as expected.
+5. Inspect the resulting trajectory to ensure it was executed correctly.
\ No newline at end of file
diff --git a/make_demos/convert_traj_to_demo.py b/make_demos/convert_traj_to_demo.py
new file mode 100644
index 000000000..c171910c8
--- /dev/null
+++ b/make_demos/convert_traj_to_demo.py
@@ -0,0 +1,95 @@
+import json
+import io
+from ruamel.yaml import YAML
+from ruamel.yaml.scalarstring import LiteralScalarString as LSS
+from pathlib import Path
+from argparse import ArgumentParser
+
+
+DEMO_COMMENT = """# This is a demo file generated from trajectory file:
+# {traj_path}
+# You can use this demo file to replay the actions in the trajectory with run_replay.py.
+# You can edit the content of the actions in this file to modify the replay behavior.
+# NOTICE:
+# Only the actions of the assistant will be replayed.
+# You do not need to modify the observation's contents or any other fields.
+# You can add or remove actions to modify the replay behavior."""
+
+
+def convert_to_literal_string(d):
+    """
+    Convert any multi-line strings to LiteralScalarString
+    """
+    if isinstance(d, dict):
+        for key, value in d.items():
+            if isinstance(value, str) and '\n' in value:
+                d[key] = LSS(value.replace('\r\n', '\n').replace('\r', '\n'))
+            elif isinstance(value, dict):
+                convert_to_literal_string(value)
+    elif isinstance(d, list):
+        for i, item in enumerate(d):
+            if isinstance(item, str) and '\n' in item:
+                d[i] = LSS(item.replace('\r\n', '\n').replace('\r', '\n'))
+            elif isinstance(item, dict):
+                convert_to_literal_string(item)
+    elif isinstance(d, str) and '\n' in d:
+        d = LSS(d.replace('\r\n', '\n').replace('\r', '\n'))
+    else:
+        raise ValueError(f"Unsupported type: {type(d)}")
+    return d
+
+
+def save_demo(data, file, traj_path):
+    """
+    Save a single task instance as a yaml file
+    """
+    data = convert_to_literal_string(data)
+    yaml = YAML()
+    yaml.indent(mapping=2, sequence=4, offset=2)
+    buffer = io.StringIO()
+    yaml.dump(data, buffer)
+    content = buffer.getvalue()
+    header = DEMO_COMMENT.format(traj_path=traj_path)
+    with open(file, "w") as f:
+        f.write(f"{header}\n{content}")
+
+
+def convert_traj_to_action_demo(traj_path: str, output_file: str = None, include_user: bool = False):
+    traj = json.load(open(traj_path))
+    history = traj["history"]
+    action_traj = list()
+    admissable_roles = {"assistant", "user"} if include_user else {"assistant"}
+    for step in history:
+        if step['role'] in admissable_roles and step.get('agent', 'primary') == 'primary':
+            action_traj.append({k: v for k, v in step.items() if k in {'content', 'role'}})
+    save_demo(action_traj, output_file, traj_path)
+    print(f"Saved demo to {output_file}")
+
+
+def main(traj_path: str, output_dir: str = None, suffix: str = "", overwrite: bool = False, include_user: bool = False):
+    filename = '/'.join([Path(traj_path).parent.name + suffix,
Path(traj_path).name.rsplit('.traj', 1)[0]]) + ".demo.yaml" + output_file = Path(output_dir) / filename + if output_file.exists() and not overwrite: + raise FileExistsError(f"Output file already exists: {output_file}") + output_file.parent.mkdir(parents=True, exist_ok=True) + convert_traj_to_action_demo(traj_path, output_file, include_user) + + +def string2bool(s): + if s.lower() in {"true", "1"}: + return True + elif s.lower() in {"false", "0"}: + return False + else: + raise ValueError(f"Invalid boolean string: {s}") + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("traj_path", type=str, help="Path to trajectory file") + parser.add_argument("--output_dir", type=str, help="Output directory for action demos", default="./demos") + parser.add_argument("--suffix", type=str, help="Suffix for the output file", default="") + parser.add_argument("--overwrite", type=string2bool, help="Overwrite existing files", default=False, nargs='?') + parser.add_argument("--include_user", type=string2bool, help="Include user responses (computer)", default=False, nargs='?') + args = parser.parse_args() + main(**vars(args)) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..f1bfca4d8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ['setuptools>=42'] +build-backend = 'setuptools.build_meta' diff --git a/run.py b/run.py new file mode 100644 index 000000000..2d1640819 --- /dev/null +++ b/run.py @@ -0,0 +1,223 @@ +import json +import logging +import os +import re +import traceback +import yaml + +from dataclasses import dataclass +from getpass import getuser +from pathlib import Path +from rich.logging import RichHandler +from simple_parsing import parse +from simple_parsing.helpers import FrozenSerializable, FlattenedAccess +from sweagent import ( + Agent, + AgentArguments, + EnvironmentArguments, + ModelArguments, + SWEEnv, + get_data_path_name, +) +from swebench import KEY_INSTANCE_ID, KEY_MODEL, KEY_PREDICTION +from unidiff import PatchSet + +handler = RichHandler(show_time=False, show_path=False) +handler.setLevel(logging.DEBUG) +logger = logging.getLogger("run_dev") +logger.setLevel(logging.DEBUG) +logger.addHandler(handler) +logger.propagate = False +logging.getLogger("simple_parsing").setLevel(logging.WARNING) + + +@dataclass(frozen=True) +class ScriptArguments(FlattenedAccess, FrozenSerializable): + environment: EnvironmentArguments + agent: AgentArguments + instance_filter: str = ".*" # Only run instances that completely match this regex + skip_existing: bool = True # Skip instances with existing trajectories + suffix: str = "" + + @property + def run_name(self): + """Generate a unique name for this run based on the arguments.""" + model_name = args.agent.model.model_name + data_stem = get_data_path_name(args.environment.data_path) + config_stem = Path(args.agent.config_file).stem + + temp = args.agent.model.temperature + top_p = args.agent.model.top_p + + per_instance_cost_limit = args.agent.model.per_instance_cost_limit + install_env = args.environment.install_environment + + return ( + f"{model_name}__{data_stem}__{config_stem}__t-{temp:.2f}__p-{top_p:.2f}" + + f"__c-{per_instance_cost_limit:.2f}__install-{int(install_env)}" + + (f"__{self.suffix}" if self.suffix else "") + ) + + +def main(args: ScriptArguments): + logger.info(f"๐Ÿ“™ Arguments: {args.dumps_yaml()}") + agent = Agent("primary", args.agent) + + env = SWEEnv(args.environment) + + traj_dir = Path("trajectories") / Path(getuser()) / args.run_name + 
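# With the defaults set in __main__ below, run_name resolves to something like
+    # "gpt4__SWE-bench_Lite__default__t-0.20__p-0.95__c-2.00__install-1",
+    # so results for this run land in trajectories/<user>/<run_name>/. +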
os.makedirs(traj_dir, exist_ok=True) + + save_arguments(traj_dir, args) + + for index in range(len(env.data)): + try: + # Reset environment + instance_id = env.data[index]["instance_id"] + if should_skip(args, traj_dir, instance_id): + continue + logger.info("โ–ถ๏ธ Beginning task " + str(index)) + + observation, info = env.reset(index) + if info is None: + continue + + # Get info, patch information + issue = getattr(env, "query", None) + files = [] + if "patch" in env.record: + files = "\n".join( + [f"- {x.path}" for x in PatchSet(env.record["patch"]).modified_files] + ) + # Get test files, F2P tests information + test_files = [] + if "test_patch" in env.record: + test_patch_obj = PatchSet(env.record["test_patch"]) + test_files = "\n".join( + [f"- {x.path}" for x in test_patch_obj.modified_files + test_patch_obj.added_files] + ) + tests = "" + if "FAIL_TO_PASS" in env.record: + tests = "\n".join([f"- {x}" for x in env.record["FAIL_TO_PASS"]]) + + setup_args = { + "issue": issue, + "files": files, + "test_files": test_files, + "tests": tests + } + info = agent.run( + setup_args=setup_args, + env=env, + observation=observation, + traj_dir=traj_dir, + return_type="info", + ) + save_predictions(traj_dir, instance_id, info) + + except KeyboardInterrupt: + logger.info("Exiting InterCode environment...") + env.close() + break + except Exception as e: + traceback.print_exc() + logger.warning(f"โŒ Failed on {env.record['instance_id']}: {e}") + env.reset_container() + continue + + +def save_arguments(traj_dir, args): + """Save the arguments to a yaml file to the run's trajectory directory.""" + log_path = traj_dir / "args.yaml" + + if log_path.exists(): + try: + other_args = args.load_yaml(log_path) + if (args.dumps_yaml() != other_args.dumps_yaml()): # check yaml equality instead of object equality + logger.warning("**************************************************") + logger.warning("Found existing args.yaml with different arguments!") + logger.warning("**************************************************") + except Exception as e: + logger.warning(f"Failed to load existing args.yaml: {e}") + + with log_path.open("w") as f: + args.dump_yaml(f) + + +def should_skip(args, traj_dir, instance_id): + """Check if we should skip this instance based on the instance filter and skip_existing flag.""" + # Skip instances that don't match the instance filter + if re.match(args.instance_filter, instance_id) is None: + logger.info(f"Instance filter not matched. 
Skipping instance {instance_id}") + return True + + # If flag is set to False, don't skip + if not args.skip_existing: + return False + + # Check if there's an existing trajectory for this instance + log_path = traj_dir / (instance_id + ".traj") + if log_path.exists(): + with log_path.open("r") as f: + data = json.load(f) + # If the trajectory has no exit status, it's incomplete and we will redo it + exit_status = data["info"].get("exit_status", None) + if exit_status == "early_exit" or exit_status is None: + logger.info(f"Found existing trajectory with no exit status: {log_path}") + logger.info("Removing incomplete trajectory...") + os.remove(log_path) + else: + logger.info(f"โญ๏ธ Skipping existing trajectory: {log_path}") + return True + return False + + +def save_predictions(traj_dir, instance_id, info): + output_file = Path(traj_dir) / "all_preds.jsonl" + model_patch = info["submission"] if "submission" in info else None + datum = { + KEY_MODEL: Path(traj_dir).name, + KEY_INSTANCE_ID: instance_id, + KEY_PREDICTION: model_patch, + } + with open(output_file, "a+") as fp: + print(json.dumps(datum), file=fp, flush=True) + logger.info(f"Saved predictions to {output_file}") + + +if __name__ == "__main__": + defaults = ScriptArguments( + suffix="", + environment=EnvironmentArguments( + image_name="swe-agent", + data_path="princeton-nlp/SWE-bench_Lite", + split="dev", + verbose=True, + install_environment=True, + ), + skip_existing=True, + agent=AgentArguments( + model=ModelArguments( + model_name="gpt4", + total_cost_limit=0.0, + per_instance_cost_limit=2.0, + temperature=0.2, + top_p=0.95, + ), + config_file="config/default.yaml", + ), + ) + + # Nicer yaml dumping of multiline strings + def multiline_representer(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, multiline_representer) + + args = parse(ScriptArguments, default=defaults, add_config_path_arg=False) + main(args) diff --git a/run_replay.py b/run_replay.py new file mode 100644 index 000000000..1f402ae39 --- /dev/null +++ b/run_replay.py @@ -0,0 +1,142 @@ +import json +import os +import subprocess +import yaml + +from argparse import ArgumentParser + + +def process_synthetic_trajs(action_trajs_path: str, config_file: str, suffix: str): + # Load action trajectories, task instances + action_trajs = [json.loads(x) for x in open(action_trajs_path, "r").readlines()] + task_instances = [x["task_instance"] for x in action_trajs] + file_name = action_trajs_path.rsplit("/", 1)[-1] + + # Temporary file names + replay_action_trajs_path = "temp_actions.jsonl" + replay_task_instances_path = file_name + + # Write task_instances to file for data_path + with open(replay_task_instances_path, "w") as f: + for t in task_instances: + print(json.dumps(t), file=f, end="\n", flush=True) + + # Write action trajectories to a file + with open(replay_action_trajs_path, "w") as f: + for t in action_trajs: + print( + json.dumps({t["task_instance"]["instance_id"]: t["actions"]}), + file=f, + end="\n", + flush=True, + ) + + # Call run.py via subprocess + command = [ + "python", + "run.py", + "--config_file", config_file, + "--data_path", replay_task_instances_path, + "--install_environment", "True", + 
"--model_name", "replay", + "--replay_path", replay_action_trajs_path + ] + if suffix is not None: + command.extend(["--suffix", suffix]) + + subprocess.run(command) + + os.remove(replay_action_trajs_path) + os.remove(replay_task_instances_path) + + +def process_single_traj(traj_path: str, config_file: str, data_path: str, suffix: str): + replay_action_trajs_path = "temp_replay.jsonl" + + # Open trajectory file, extract responses as actions + if traj_path.endswith(".yaml"): + traj_data = dict() + with open(traj_path, "r") as f: + traj_data["history"] = yaml.safe_load(f) + else: + traj_data = json.load(open(traj_path, "r")) + actions = [x["content"] for x in traj_data["history"] if x["role"] == "assistant"] + instance_id = traj_path.split("/")[-1].split(".")[0] + with open(replay_action_trajs_path, "w") as f: + print( + json.dumps({instance_id: actions}), + file=f, + end="\n", + flush=True + ) + replay_task_instances_path = instance_id + ".jsonl" + + # Get data_path from args.yaml + if data_path is None: + args_path = os.path.join( + os.path.dirname(traj_path), + "args.yaml" + ) + args = yaml.safe_load(open(args_path)) + data_path = args['environment']['data_path'] + + # Identify the relevant task instance and create it + data = None + if data_path.endswith(".jsonl"): + data = [json.loads(x) for x in open(data_path, "r").readlines()] + elif data_path.endswith(".json"): + data = json.load(open(data_path)) + else: + raise ValueError("--data_path must be a .json or .jsonl") + data = [d for d in data if d["instance_id"] == instance_id] + + with open(replay_task_instances_path, "w") as f: + for d in data: + print(json.dumps(d), file=f, end="\n", flush=True) + + # Call run.py via subprocess + command = [ + "python", + "run.py", + "--config_file", config_file, + "--data_path", replay_task_instances_path, + "--install_environment", "True", + "--model_name", "replay", + "--replay_path", replay_action_trajs_path, + ] + if suffix is not None: + command.extend(["--suffix", suffix]) + subprocess.run(command) + + os.remove(replay_action_trajs_path) + os.remove(replay_task_instances_path) + + +def main( + action_trajs_path: str, + traj_path: str, + config_file: str, + data_path: str, + suffix: str, +): + if action_trajs_path is not None: + process_synthetic_trajs(action_trajs_path, config_file, suffix) + elif traj_path is not None: + process_single_traj(traj_path, config_file, data_path, suffix) + else: + print( + "No replays generated.\n" + "You must either provide one of the following. 
Either...\n" + "\t* --action_trajs_path for replaying synthetic trajectories\n" + "\t* --traj_path for replaying SWE-agent style trajectories (from ./trajectories folder)\n" + ) + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--action_trajs_path", help="Path to action trajectories to replay", default=None) + parser.add_argument("--traj_path", help="Path to trajectory to replay", default=None) + parser.add_argument("--config_file", help="Path to template") + parser.add_argument("--data_path", help="(Optional) Path to data file containing task instances ref'ed by replay trajectories", default=None) + parser.add_argument("--suffix", help="(Optional) Suffix argument appended to end of traj path", default=None) + args = parser.parse_args() + main(**vars(args)) diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 000000000..406094051 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,65 @@ +# Scripts + +This README contains documentation for the main inference script `run.sh` along with some miscellaneous scripts that may be helpful. + +> โš ๏ธ These scripts have been written to be invoked from the root of this codebase (i.e. `./scripts/run.sh`). + +## ๐Ÿƒ Inference Script +The `./run.sh` script has been provided as an example of how to invoke `run.py`. + +A single `run.py` call will generate a `trajectory//` folder containing the trajectories and predictions generated by a `` model run on every instance in the `` dataset. + +The following is a comprehensive guide to using the provided `run.py` script, detailing available command-line arguments, their purposes, and default values. Flags that you might find helpful have been marked with a ๐Ÿ’ก. + +The code and explanation of the implementations for *configuration based workflows* are explained in [`agent/`](../agent/README.md). + +Run `python run.py --help` to view this documentation on the command line. + +#### Optional Arguments +* `-h, --help`: Show the help message and exit. + +#### Script Arguments +These arguments configure the script's behavior: +* `--instance_filter ` ๐Ÿ’ก: Run instances that match this regex pattern. Default is .*. +* `--noskip_existing, --skip_existing,`: [Do not] skip instances that have been completed before. +* `--suffix `: Appends a suffix to the name of the folder containing the trajectories for an experiment run. + +#### Environment Arguments +These arguments are related to the environment configuration: +* `--data_path ` ๐Ÿ’ก: Path to the data file -or- a GitHub issue URL. +* `--base_commit `: You can specify the base commit sha to checkout. This is determined automatically for instances in SWE-bench. +* `--image_name `: Name of the Docker image to use. Default is swe-agent. +* `--noinstall_environment, --install_environment`: [Do not] install the environment. Default is True. +* `--noverbose, --verbose`: Enable verbose output. Default is False. +* `--timeout `: Timeout in seconds. Default is 35. +* `--container_name ` ๐Ÿ’ก: Name of the Docker container if you would like to create a persistent container. Optional. +> โš ๏ธ If you specify a container name, do not run multiple instances of `run.py` with the same container name! + +#### AgentArguments +Configure agent behavior: +* `--config_file ` ๐Ÿ’ก: Path to the configuration YAML file. Default is config/default.yaml. + +#### ModelArguments +Configure model parameters: +* `--model_name ` ๐Ÿ’ก: Name of the model. Default is `gpt4`. 
+* `--per_instance_cost_limit ` ๐Ÿ’ก: Per-instance cost limit (interactive loop will automatically terminate when cost limit is hit). Default is 2.0. +* `--temperature ` ๐Ÿ’ก: Model temperature. Default is 0.2. +* `--top_p ` ๐Ÿ’ก: Top p filtering. Default is 0.95. +* `--total_cost_limit `: Total cost limit. Default is 0.0. + +### ๐Ÿ“™ Example Usage +Run with custom data path and verbose mode: +```bash +python run.py --data_path /path/to/data.json --verbose +``` + +Specify a model and adjust the temperature and top_p parameters: +```bash +python run.py --model_name gpt4 --temperature 0.5 --top_p 0.8 +``` + +## ๐Ÿ› ๏ธ Miscellaneous Scripts +- `remove_all_containers.sh`: Forcibly removes all Docker containers currently present on the system. +- `run_and_eval.sh`: Runs SWE-agent inference and evaluation on a specified dataset N times. You can specify the `dataset_path`, `num_runs`, `template`, and `suffix` arguments. +- `run_jsonl.sh`: Run SWE-agent inference from a `.jsonl` file that contains a SWE-bench style task instance. +- `run_replay.sh`: Run SWE-agent inference from a `.traj` file. This is useful for automatically creating a new demonstration for a new config from an existing sequence of actions. diff --git a/scripts/remove_all_containers.sh b/scripts/remove_all_containers.sh new file mode 100755 index 000000000..fdb5793b3 --- /dev/null +++ b/scripts/remove_all_containers.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# Remove all docker containers + +docker rm -f $(docker ps -aq) diff --git a/scripts/run.sh b/scripts/run.sh new file mode 100755 index 000000000..8d89d5bd0 --- /dev/null +++ b/scripts/run.sh @@ -0,0 +1,5 @@ +python run.py \ + --model_name human \ + --data_path ./data/dev-easy/swe-bench-dev-easy.json \ + --per_instance_cost_limit 2.00 \ + --config_file ./config/default.yaml \ No newline at end of file diff --git a/scripts/run_and_eval.sh b/scripts/run_and_eval.sh new file mode 100644 index 000000000..c2cec4aaa --- /dev/null +++ b/scripts/run_and_eval.sh @@ -0,0 +1,29 @@ +#!/bin/bash +#this script runs and evaluates the agent N times. 
+#to run: +#bash run_and_eval.sh '' default_with_inclusive_edit_demo_v2 data/dev-easy/swe-bench-dev-easy-med.json 3 +# vars: suffix template data number of runs + +# define user variables +suffix=${1:-''} +template=$2 +dataset_path=$3 +num_runs=$4 + +# extract filename from the dataset path +dataset_name=`basename $dataset_path` + +for((i=1; i<=num_runs; i++)); do + # command 1 + python run.py --model_name gpt4 --data_path $dataset_path --config_file config/configs/$template.yaml --suffix ${suffix}run${i} --temperature 0.2 --top_p 0.95 --per_instance_cost_limit 3.00 --install_environment 1 + + # command 2 + python evaluation/evaluation.py \ + --predictions_path trajectories/$USER/gpt4__${dataset_name}__$template__t-0.20__p-0.95__c-3.00__install-1__${suffix}run${i}/all_preds.jsonl \ + --swe_bench_tasks $dataset_path \ + --log_dir ./results \ + --testbed ./testbed \ + --skip_existing \ + --timeout 900 \ + --verbose +done diff --git a/scripts/run_from_url.sh b/scripts/run_from_url.sh new file mode 100755 index 000000000..27c90ca2e --- /dev/null +++ b/scripts/run_from_url.sh @@ -0,0 +1,6 @@ +python run.py \ + --model_name human \ + --data_path https://github.com/psf/requests/issues/6254 \ + --base_commit ac3be98b19f4d09c6a970b271a3ae30f3d0858f7 \ + --per_instance_cost_limit 2.00 \ + --config_file ./config/default_from_url.yaml \ No newline at end of file diff --git a/scripts/run_jsonl.sh b/scripts/run_jsonl.sh new file mode 100755 index 000000000..f681a28a3 --- /dev/null +++ b/scripts/run_jsonl.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Check if an argument was provided +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +FILE="$1" + +# Check if the file exists and is readable +if [ ! -f "$FILE" ] || [ ! -r "$FILE" ]; then + echo "Error: File '$FILE' does not exist or is not readable." + exit 2 +fi + +# Iterate over each line of the JSONL file +while IFS= read -r line; do + # Construct command arguments from the JSON map + # jq -r '. | to_entries | .[] | "--\(.key) \(.value)"' converts each key-value pair in the JSON object + # into a format suitable for passing to the Python script + # xargs -n 2 groups them back into pairs to handle as arguments correctly + ARGS=$(echo "$line" | jq -r '. 
| to_entries | .[] | "--\(.key) \(.value)"' | xargs -n 2 echo) + + # Execute the Python script with the constructed arguments + echo $ARGS + python run.py $ARGS +done < "$FILE" diff --git a/scripts/run_replay.sh b/scripts/run_replay.sh new file mode 100755 index 000000000..2bc878ac6 --- /dev/null +++ b/scripts/run_replay.sh @@ -0,0 +1,4 @@ +python run_replay.py \ + --traj_path ./trajectories/johnbyang/-demonstrations/pvlib__pvlib-python-1216.default_test.traj \ + --data_path ./trajectories/johnbyang/-demonstrations/pvlib__pvlib-python-1216.json \ + --config_file config/e2e/default_test.yaml \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 000000000..c0e8d0b47 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,3 @@ +[metadata] +version = attr: sweagent.__version__ +license_files = LICENSE diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..32f692b4b --- /dev/null +++ b/setup.py @@ -0,0 +1,48 @@ +import setuptools + +with open('README.md', 'r', encoding='utf-8') as fh: + long_description = fh.read() + +setuptools.setup( + name='sweagent', + author='John Yang', + author_email='byjohnyang@gmail.com', + description='The official SWE-agent package - an open source Agent Computer Interface for running language models as software engineers', + keywords='nlp, agents, code', + long_description=long_description, + long_description_content_type='text/markdown', + url='https://swe-agent.com', + project_urls={ + 'Documentation': 'https://github.com/princeton-nlp/SWE-agent', + 'Bug Reports': 'http://github.com/princeton-nlp/SWE-agent/issues', + 'Source Code': 'http://github.com/princeton-nlp/SWE-agent', + 'Website': 'https://sweagent.com', + }, + packages=setuptools.find_packages(), + classifiers=[ + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3 :: Only', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + ], + python_requires='>=3.9', + install_requires=[ + 'anthropic', + 'config', + 'datasets', + 'docker', + 'gnureadline', + 'gymnasium', + 'numpy', + 'openai>=1.0', + 'pandas', + 'rich', + 'ruamel.yaml', + 'simple-parsing', + 'swebench>=1.0.1', + 'tenacity', + 'together' + 'unidiff', + ], + include_package_data=True, +) diff --git a/setup.sh b/setup.sh new file mode 100755 index 000000000..2778613b9 --- /dev/null +++ b/setup.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Create docker image +echo "Setting up docker image for swe-agent..." +arch=$(uname -m) +if [[ "$arch" == "x86_64" ]]; then + echo "Building the x86 Docker image" + docker build -t swe-agent --build-arg MINICONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-1-Linux-x86_64.sh -f docker/swe.Dockerfile . +elif [[ "$arch" == "aarch64" || "$arch" == "arm64" ]]; then + echo "Ayy, arm64 in the house!" + docker build -t swe-agent --build-arg MINICONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-py39_23.11.0-1-Linux-aarch64.sh -f docker/swe.Dockerfile . +else + echo "unknown architecture detected?" + echo $arch + exit 1 +fi + +# build eval.Dockerfile +echo "Setting up docker image for evaluation..." +docker build -t swe-eval -f docker/eval.Dockerfile . + +echo "Done with setup!" 
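+
+# Optional sanity check (a suggestion, not part of the original script):
+# both images should now be listed by
+#   docker images --format '{{.Repository}}' | grep -E '^(swe-agent|swe-eval)$'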
diff --git a/sweagent/__init__.py b/sweagent/__init__.py new file mode 100644 index 000000000..2f44a3a36 --- /dev/null +++ b/sweagent/__init__.py @@ -0,0 +1,19 @@ +__version__ = "0.0.1" + +from sweagent.agent.agents import ( + Agent, + AgentArguments, +) + +from sweagent.agent.models import ( + ModelArguments, +) + +from sweagent.environment.swe_env import ( + EnvironmentArguments, + SWEEnv, +) + +from sweagent.environment.utils import ( + get_data_path_name, +) \ No newline at end of file diff --git a/sweagent/agent/README.md b/sweagent/agent/README.md new file mode 100644 index 000000000..c74c1eb47 --- /dev/null +++ b/sweagent/agent/README.md @@ -0,0 +1,46 @@ +# Agents +The `agent` folder contains the logic for handling model inference and facilitating their interaction with `SWEEnv`. +The following documentation describing the purpose and classes of each file. + +#### `agents.py` +This file defines the `Agent` class, which facilitates the interaction between an agent and the environment. The `AgentConfig` and `AgentArguments` data classes compile all arguments into a single file. +- `Agent`: Main class for handling model behavior + interaction with environment + - `__init__`: Sets up model, assistant, configurations, and arguments + - `state_command`: Getter for bash command for extracting env. state + - `setup`: Resets cost stats, initializes system message (+ demonstrations), and returns full list of bash commands to define within environment. + - `forward`: Main inference call to model. + - `forward_model`: Determines appropriate observation template, then makes inference call to model + - `forward_with_format_check`: Invokes `forward_model`, with retry calls to handle blocked or malformed actions. + - `forward_with_error_check`: Wraps `forward_with_format_check` with exception handling. + +#### `commands.py` +This file defines the abstraction for custom commands (non-native functions that are implemented in bash) that agents can invoke in `swe-agent` environment. On top of the abstraction, helper functions to extract commands' documentation and compile `.sh` files into separate `Command` objects are provided. There are also fields for establishing the input/output of each action and control flow of actions via templates. +- `AssistantMetadata`: Defines templates for formatting input/output to sub-assistant calls +- `Command`: Defines fields of a custom command +- `ControlMetadata` (WIP): Defines template fields that format the observations for the next agent `forward` inference call +- `generate_command_docs`: Extracts docstrings from each command to form comprehensive documentation. +- `parse_command_file`: Converts bash file content to separate `Command` objects + +#### `models.py` +This file defines the abstraction for running inference on API models. In addition, the `BaseModel` abstraction also defines a set of cost-related fields for tracking instance-level and total expenses accumulated across a single model run. 
+- `AnthropicModel`: Handles inference + cost logging for Anthropic Models +- `APIStats`: Cost tracking fields that are updated per model inference +- `BaseModel`: Abstract class that defines the common logic for updating cost stats +- `get_model`: Returns initialized `[Anthropic|Human|OpenAI]Model` based on given arguments + commands +- `HumanModel`: Handles inference for human task worker +- `ModelArguments`: Model name, hyperparameter, and cost limit arguments +- `OpenAIModel`: Handles inference + cost logging for OpenAI models + +#### `parsing.py` +This file defines the abstraction for parsing the output of the model inference. The `Parsing` class is used to extract the relevant information from the model's output and format it into a response that can be used by the `Agent` class. +- `Parsing`: Abstract class that defines the common logic for parsing model output + +#### `history_processors.py` +This file defines the abstraction for processing the history of the environment. The `HistoryProcessor` class is used to extract the relevant information from the history of the environment and format it into a response that can be used by the `Agent` class. +- `HistoryProcessor`: Abstract class that defines the common logic for processing the history of the environment +- `DefaultHistoryProcessor`: Default implementation of `HistoryProcessor` that processes the history of the environment + +### Environment Usage +* To skip over a task instance, use the `skip` keyword +* To submit for evaluation, use the `submit` keyword +* To exit the `SWEEnv` environment, perform a keyboard interrupt (`^ c`) \ No newline at end of file diff --git a/sweagent/agent/__init__.py b/sweagent/agent/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/sweagent/agent/agents.py b/sweagent/agent/agents.py new file mode 100644 index 000000000..bbe2c89ae --- /dev/null +++ b/sweagent/agent/agents.py @@ -0,0 +1,681 @@ +import json +import re +import logging + +from dataclasses import dataclass +from pathlib import Path +from simple_parsing.helpers import field, FrozenSerializable, FlattenedAccess +from sweagent.agent.commands import Command, ParseCommand +from sweagent.agent.history_processors import HistoryProcessor +from sweagent.agent.models import ( + APIStats, + ContextWindowExceededError, + CostLimitExceededError, + ModelArguments, + get_model, +) +from sweagent.agent.parsing import ParseFunction, FormatError +from sweagent.environment.utils import LOGGER_NAME +from sweagent.environment.swe_env import SWEEnv +from tenacity import RetryError +from typing import Optional, Tuple, Any + +logger = logging.getLogger(LOGGER_NAME) + + +@dataclass(frozen=True) +class Subroutine(FrozenSerializable): + name: str + agent_file: str + return_type: str = None # one of "action", "observation", "response", "state", "thought" + init_observation: Optional[str] = None + end_name: Optional[str] = None + signature: Optional[str] = None + docstring: Optional[str] = None + model: Optional[ModelArguments] = None + agent_args: Optional[Any] = None + + +@dataclass(frozen=True) +class AgentConfig(FrozenSerializable): + system_template: str + instance_template: str + next_step_template: Optional[str] = None # defaults to instance_template + next_step_no_output_template: Optional[str] = None # defaults to next_step_template + strategy_template: Optional[str] = None + demonstration_template: Optional[str] = None + demonstrations: list[str] = field(default_factory=list) + put_demos_in_history: bool = False # if True, add 
demonstration to history instead of as a single message + format_error_template: str = None # defaults to format_error_template in ParseFunction + command_files: list[str] = field(default_factory=list) + env_variables: dict[str, str] = field(default_factory=dict) + util_functions: list[str] = field(default_factory=list) + submit_command: str = "submit" + parse_function: str = "ThoughtActionParser" + parse_command: str = "ParseCommandBash" + history_processor: str = "DefaultHistoryProcessor" + history_processor_args: dict[str, Any] = field(default_factory=dict) + command_docs: str = None + blocklist_error_template: str = "Interactive operation '{name}' is not supported by this environment" + blocklist: Tuple[str] = ( + "vim", + "vi", + "emacs", + "nano", + "nohup", + "git", + ) + blocklist_standalone: Tuple[str] = ( + "python", + "python3", + "ipython", + "bash", + "sh", + "exit", + "/bin/bash", + "/bin/sh", + "nohup", + "vi", + "vim", + "emacs", + "nano", + ) + # Should extract environment state in a json readable form + state_command: Command = Command( + name="state", + code="""state() { + echo '{"working_dir": "'$(realpath --relative-to=$ROOT/.. $PWD)'"}'; + };""", + ) + _commands: list[Command] = field(default_factory=list) + _subroutines: dict[str, Subroutine] = field(default_factory=dict) + subroutine_types: list[Subroutine] = field(default_factory=list) + + def __post_init__(self): + if self.next_step_template is None: + object.__setattr__(self, "next_step_template", self.instance_template) + if self.next_step_no_output_template is None: + object.__setattr__( + self, "next_step_no_output_template", self.next_step_template + ) + + object.__setattr__(self, "parse_command", ParseCommand.get(self.parse_command)) + for file in self.command_files: + commands = self.parse_command.parse_command_file(file) + + util_functions = [ + command for command in commands if command.name.startswith("_") + ] + commands = [ + command for command in commands if not command.name.startswith("_") + ] + + object.__setattr__( + self, "util_functions", self.util_functions + util_functions + ) + object.__setattr__(self, "_commands", self._commands + commands) + + for subroutine in self.subroutine_types: + if subroutine.name == 'submit': + raise ValueError("Cannot use 'submit' as a subroutine name") + agent_args = AgentArguments( + model=subroutine.model, + config_file=subroutine.agent_file, + ) + object.__setattr__(subroutine, "agent_args", agent_args) + object.__setattr__(self, "_subroutines", {**self._subroutines, subroutine.name: subroutine}) + + multi_line_command_endings = { + command.name: command.end_name + for command in [*self._commands, *self._subroutines.values()] + if command.end_name is not None + } + object.__setattr__(self, "multi_line_command_endings", multi_line_command_endings) + object.__setattr__( + self, + "command_docs", + self.parse_command.generate_command_docs( + self._commands, + self.subroutine_types, + **self.env_variables, + ), + ) + object.__setattr__(self, "parse_function", ParseFunction.get(self.parse_function)) + if self.format_error_template is None: + object.__setattr__( + self, + "format_error_template", + self.parse_function.format_error_template, + ) + object.__setattr__(self, "format_error_template", self.format_error_template.format(**self.__dict__)) + for command in self._commands: + if command.name == self.submit_command: + object.__setattr__(self, "submit_command_end_name", command.end_name) + break + object.__setattr__( + self, "history_processor", + 
HistoryProcessor.get(self.history_processor, **self.history_processor_args) + ) + + +@dataclass(frozen=True) +class AgentArguments(FlattenedAccess, FrozenSerializable): + model: ModelArguments = None + + # Policy can only be set via config yaml file from command line + config_file: Optional[Path] = None + config: Optional[AgentConfig] = field(default=None, cmd=False) + + def __post_init__(self): + if self.config is None and self.config_file is not None: + # If unassigned, we load the config from the file to store its contents with the overall arguments + config = AgentConfig.load_yaml(self.config_file) + object.__setattr__(self, "config", config) + assert self.config is not None + for subroutine in getattr(self.config, "subroutines", {}).values(): + model_args = getattr(subroutine, "model") + object.__setattr__(model_args, "per_instance_cost_limit", self.model.per_instance_cost_limit) + object.__setattr__(model_args, "total_cost_limit", self.model.total_cost_limit) + + +class Agent: + """Agent handles the behaviour of the model and how it interacts with the environment.""" + + def __init__(self, name: str, args: AgentArguments): + self.name = name + self.model = get_model(args.model, args.config._commands + args.config.subroutine_types) + self.config = args.config + self.system_args = { + "command_docs": self.config.command_docs, + **self.config.env_variables, + } + self.instance_args = None + self._parse_command_patterns() + self.history = [] + self.last_container_id = None + + def setup(self, instance_args, init_model_stats=None) -> None: + """Setup the agent for a new instance.""" + self.model.reset_stats(init_model_stats) + self.instance_args = instance_args + + system_msg = self.config.system_template.format(**self.system_args) + logger.info(f"SYSTEM ({self.name})\n{system_msg}") + + self.history = [ + {"role": "system", "content": system_msg, "agent": self.name}, + ] + + if len(self.config.demonstrations) > 0 and "history_to_messages" in dir( + self.model + ): + for demonstration_path in self.config.demonstrations: + if self.config.demonstration_template is None and not self.config.put_demos_in_history: + raise ValueError("Cannot use demonstrations without a demonstration template or put_demos_in_history=True") + + # Load history + logger.info(f"DEMONSTRATION: {demonstration_path}") + demo_history = json.load(open(demonstration_path, "r"))["history"] + demo_history = [ + entry for entry in demo_history + if ("agent" not in entry) or + ("agent" in entry and entry["agent"] == self.name) + ] + + if self.config.put_demos_in_history: + if self.config.demonstration_template is not None: + logger.warning("Demonstration template is ignored for put_demos_in_history=True") + # Add demonstration to history directly as separate messages + for entry in demo_history: + if entry["role"] != "system": + entry["is_demo"] = True + self.history.append(entry) + else: + # Add demonstration as single message to history + demo_message = self.model.history_to_messages( + demo_history, + is_demonstration=True, + ) + demonstration = self.config.demonstration_template.format( + **{"demonstration": demo_message} + ) + self.history.append({ + "agent": self.name, + "content": demonstration, + "is_demo": True, + "role": "user", + }) + + @property + def state_command(self) -> str: + """Return the bash command that will be used to extract the environment state.""" + return self.config.state_command.name + + @property + def local_history(self) -> list[dict[str, str]]: + """Return the history of the agent since the 
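To make the two demonstration modes handled in `Agent.setup` concrete, here is a minimal standalone sketch (plain Python, no `sweagent` imports). The file layout mirrors the `{"history": [...]}` format read above; the demonstration contents, agent name, and template are all illustrative:

```python
import json
import tempfile

# A demonstration file holds a recorded history under the "history" key.
demo = {"history": [
    {"role": "user", "content": "Fix the bug in utils.py", "agent": "primary"},
    {"role": "assistant", "content": "```\nsubmit\n```", "agent": "primary"},
]}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(demo, f)
    demo_path = f.name

history = [{"role": "system", "content": "You are a software engineer.", "agent": "primary"}]
demo_history = json.load(open(demo_path))["history"]

put_demos_in_history = True
if put_demos_in_history:
    # Mode 1: replay each demonstration turn as its own message, tagged is_demo
    for entry in demo_history:
        if entry["role"] != "system":
            history.append({**entry, "is_demo": True})
else:
    # Mode 2: flatten the demonstration into one user message via a template
    demonstration_template = "Here is a demonstration:\n{demonstration}"
    flat = "\n".join(e["content"] for e in demo_history)
    history.append({
        "role": "user",
        "content": demonstration_template.format(demonstration=flat),
        "is_demo": True,
        "agent": "primary",
    })

print(json.dumps(history, indent=2))
```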
last reset."""
+        return self.config.history_processor([entry for entry in self.history if entry["agent"] == self.name])
+
+    def save_trajectory(self, trajectory, traj_dir, env, info):
+        log_path = traj_dir / (env.record['instance_id'] + ".traj")
+        log_dict = {
+            "environment": env.name,
+            "trajectory": trajectory,
+            "history": self.history,
+            "info": info,
+        }
+        with log_path.open("w") as f:
+            json.dump(log_dict, f, indent=2)
+        logger.info(f"Saved trajectory to {log_path}")
+
+    def _get_first_match(self, action: str, pattern_type: str) -> Optional[re.Match]:
+        """Return the first match of a command pattern in the action string."""
+        if pattern_type == "subroutine":
+            patterns = {k: v for k, v in self.subroutine_patterns.items()}
+        elif pattern_type == "multi_line":
+            patterns = {k: v for k, v in self.command_patterns.items() if k in self.config.multi_line_command_endings or k == self.config.submit_command}
+            # dicts do not support `+=`; merge the subroutine patterns in with update()
+            patterns.update({k: v for k, v in self.subroutine_patterns.items() if k in self.config.multi_line_command_endings})
+        elif pattern_type == "multi_line_no_subroutines":
+            patterns = {k: v for k, v in self.command_patterns.items() if k in self.config.multi_line_command_endings}
+        else:
+            raise ValueError(f"Unknown pattern type: {pattern_type}")
+        matches = list()
+        for name, pat in patterns.items():
+            match = pat.search(action)
+            if match:
+                matches.append(match)
+        if len(matches) == 0:
+            return None
+        matches = sorted(matches, key=lambda x: x.start())
+        return matches[0]
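For reference, the `.traj` file written by `save_trajectory` above is plain JSON. A sketch of its shape (all field values here are made up for illustration):

```python
import json

# Illustrative shape of a saved trajectory file; values are examples only.
traj = {
    "environment": "swe_main",
    "trajectory": [
        {
            "action": "ls",
            "observation": "README.md setup.py",
            "response": "DISCUSSION\nLet's look around.\n```\nls\n```",
            "state": '{"working_dir": "repo"}',
            "thought": "DISCUSSION\nLet's look around.\n",
        }
    ],
    "history": [
        {"role": "system", "content": "You are an autonomous programmer.", "agent": "primary"}
    ],
    "info": {"model_stats": {"total_cost": 0.12, "api_calls": 3}},
}

with open("example_instance.traj", "w") as f:
    json.dump(traj, f, indent=2)
```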
+    def _guard_multiline_input(self, action: str) -> str:
+        """Split the action by multiline commands, then append "<< '{end_name}'" to the first line of each multiline command.
+
+        Multiline commands (which are specified by an end_name) span multiple lines and are terminated by a specific end_name.
+        Their multi-line argument is sent using a heredoc, which is bash's way of passing a multi-line string to a command.
+        """
+        parsed_action = list()
+        rem_action = action
+        while rem_action.strip():
+            first_match = self._get_first_match(rem_action, "multi_line_no_subroutines")
+            if first_match:
+                pre_action = rem_action[:first_match.start()]
+                match_action = rem_action[first_match.start():first_match.end()]
+                rem_action = rem_action[first_match.end():]
+                if pre_action.strip():
+                    parsed_action.append(pre_action)
+                if match_action.strip():
+                    eof = first_match.group(3).strip()
+                    if not match_action.split('\n')[0].strip().endswith(f"<< '{eof}'"):
+                        # match_action already starts at the match, so guard it directly
+                        first_line = match_action.split('\n')[0]
+                        guarded_command = match_action.replace(
+                            first_line,
+                            first_line + f" << '{eof}'",
+                            1
+                        )
+                        parsed_action.append(guarded_command)
+                    else:
+                        parsed_action.append(match_action)
+            else:
+                parsed_action.append(rem_action)
+                rem_action = ""
+        return '\n'.join(parsed_action)
+
+    def split_actions(self, action: str, pattern_type="subroutine") -> list[dict]:
+        """Split an action into a list of actions in a greedy manner, each of which is a subroutine call or a single command."""
+        parsed_action = list()
+        rem_action = action
+        while rem_action.strip():
+            first_match = self._get_first_match(rem_action, pattern_type)
+            if first_match:
+                pre_action = rem_action[:first_match.start()]
+                match_action = rem_action[first_match.start():first_match.end()]
+                rem_action = rem_action[first_match.end():]
+                if pre_action.strip():
+                    parsed_action.append({'agent': self.name, 'action': pre_action, 'cmd_name': None})
+                if match_action.strip():
+                    if match_action.split()[0] == self.config.submit_command:
+                        # submit command is not a subroutine
+                        parsed_action.append({'agent': self.name, 'action': match_action, 'cmd_name': first_match.group(1)})
+                    else:
+                        parsed_action.append({'agent': first_match.group(1), 'args': first_match.group(2), 'action': match_action, 'cmd_name': first_match.group(1)})
+            else:
+                parsed_action.append({'agent': self.name, 'action': rem_action, 'cmd_name': None})
+                rem_action = ""
+        return parsed_action
+
+    def _parse_command_patterns(self):
+        self.command_patterns = dict()
+        for command in self.config._commands:
+            if command.end_name is not None:
+                pat = re.compile(fr'^\s*({command.name})\s*(.*?)^({command.end_name})\s*$', re.DOTALL | re.MULTILINE)
+                self.command_patterns[command.name] = pat
+            else:
+                pat = re.compile(fr'^\s*({command.name})\s*(.*?)$', re.MULTILINE)
+                self.command_patterns[command.name] = pat
+        self.subroutine_patterns = dict()
+        for _, subroutine in self.config._subroutines.items():
+            if subroutine.end_name is None:
+                pat = re.compile(fr'^\s*({subroutine.name})\s*(.*?)$', re.MULTILINE)
+                self.subroutine_patterns[subroutine.name] = pat
+            else:
+                pat = re.compile(fr'^\s*({subroutine.name})\s*(.*?)^({subroutine.end_name})\s*$', re.DOTALL | re.MULTILINE)
+                self.subroutine_patterns[subroutine.name] = pat
+        if hasattr(self.config, 'submit_command_end_name'):
+            submit_pat = re.compile(rf'^\s*({self.config.submit_command})\s*(.*?)^({self.config.submit_command_end_name})\s*$', re.DOTALL | re.MULTILINE)
+        else:
+            submit_pat = re.compile(rf'^\s*({self.config.submit_command})(\s*)$', re.MULTILINE)  # group 2 is nothing
+        self.subroutine_patterns[self.config.submit_command] = submit_pat
+        self.command_patterns[self.config.submit_command] = submit_pat
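To make the heredoc guard concrete, here is a standalone sketch of the transformation on a toy `edit` command. The command name, the `end_of_edit` terminator, and the hand-written pattern are illustrative; in the agent, the pattern set comes from the loaded command files:

```python
import re

# If a multi-line command is not already written as a heredoc, append
# << 'end_of_edit' to its first line so bash reads the body as one string.
END_NAME = "end_of_edit"
pattern = re.compile(r"^\s*(edit)\s*(.*?)^(end_of_edit)\s*$", re.DOTALL | re.MULTILINE)

action = "edit 1:2\nprint('hello')\nend_of_edit"
match = pattern.search(action)
if match:
    first_line, rest = action.split("\n", 1)
    if not first_line.rstrip().endswith(f"<< '{END_NAME}'"):
        action = f"{first_line} << '{END_NAME}'\n{rest}"

print(action)
# edit 1:2 << 'end_of_edit'
# print('hello')
# end_of_edit
```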
{"role": "assistant", + "content": output, + "thought": thought, + "action": action, + "agent": self.name, + } + ) + + logger.info(f"๐Ÿ’ญ THOUGHT ({self.name})\n{thought}") + logger.info(f"๐ŸŽฌ ACTION ({self.name})\n{action}") + + return thought, action, output + + def forward_model(self, observation: str, state: str) -> str: + """Query the model with the current state and observation with the appropriate template. + + Returns the model output.""" + + state_vars = json.loads(state) + + templates = [] + # Determine observation template based on what prior observation was + if self.history[-1]["role"] == "system" or self.history[-1].get("is_demo", False): + # Show instance template if prev. obs. was initial system message + templates = [self.config.instance_template] + if self.config.strategy_template is not None: + templates.append(self.config.strategy_template) + elif observation is None or observation.strip() == "": + # Show no output template if observation content was empty + templates = [self.config.next_step_no_output_template] + else: + # Show standard output template if there is observation content + templates = [self.config.next_step_template] + + # Populate selected template(s) with information (e.g., issue, arguments, state) + messages = [] + for template in templates: + messages.append( + template.format( + **self.instance_args, + **self.system_args, + **state_vars, + observation=(observation if observation is not None else ""), + ) + ) + + message = "\n".join(messages) + + logger.info(f"๐Ÿค– MODEL INPUT\n{message}") + self.history.append({"role": "user", "content": message, "agent": self.name}) + + return self.model.query(self.local_history) + + def retry_after_format_fail(self, output): + """Ask the model to correct (without committing to persistent history) after a malformatted model output""" + format_error_template = self.config.format_error_template + + logger.warning(f"MALFORMED OUTPUT\n{output}") + logger.warning(f"FORMAT ERROR\n{format_error_template}") + + temp_history = self.local_history + [ + {"role": "assistant", "content": output, "agent": self.name}, + {"role": "user", "content": format_error_template, "agent": self.name}, + ] + return self.model.query(temp_history) + + def retry_after_blocklist_fail(self, output, action): + """Ask the model to correct (without committing to persistent history) after a disallowed command""" + name = action.strip().split()[0] + blocklist_error_message = self.config.blocklist_error_template.format(name=name) + + logger.warning(f"BLOCKLISTED OUTPUT\n{output}") + logger.warning(f"BLOCKLIST ERROR\n{blocklist_error_message}") + + temp_history = self.local_history + [ + {"role": "assistant", "content": output, "agent": self.name}, + {"role": "user", "content": blocklist_error_message, "agent": self.name}, + ] + return self.model.query(temp_history) + + def should_block_action(self, action): + """Check if the command should be blocked.""" + names = action.strip().split() + if len(names) == 0: + return False + name = names[0] + if name in self.config.blocklist: + return True + if name in self.config.blocklist_standalone and name == action.strip(): + return True + return False + + def check_format_and_requery( + self, output: str, + ) -> Tuple[str, str, str]: + """Query the model with the current state and observation with the appropriate template. + + Try to parse the output into a thought and action. Retry if the output is malformatted or the action is blocked. + + Returns the thought, action, and raw model output. 
+ """ + # Condition for handling outputs with no thought (just action) + if self.model.args.model_name == "human": + return "", output, output + elif self.model.args.model_name == "human_thought": + thought, action = ParseFunction.get("ThoughtActionParser")( + output, + self.config._commands + self.config.subroutine_types, + strict=False, + ) + return thought, action, output + + format_fails = blocklist_fails = 0 + + while format_fails + blocklist_fails <= 2: + try: + thought, action = self.config.parse_function( + output, + self.config._commands + self.config.subroutine_types, + strict=False, + ) + except KeyboardInterrupt: + raise + except FormatError as e: + format_fails += 1 + output = self.retry_after_format_fail(output) + continue + if self.should_block_action(action): + blocklist_fails += 1 + output = self.retry_after_blocklist_fail(output, action) + else: + return thought, action, output + logger.warning(f"Malformat limit reached: \n{output}") + return "Exit due to format error", "exit_format", output + + def forward_with_error_check(self, observation: str, state: str) -> Tuple[str, str, str]: + try: + output = self.forward_model(observation, state) + except KeyboardInterrupt: + raise + except RuntimeError as e: + logger.warning(f"Runtime error: {e}") + return ( + f"Exit due to runtime error: {e}", + "exit_error", + f"exit due to runtime error: {e}", + ) + except ContextWindowExceededError as e: + logger.warning(f"Context window exceeded") + return "Exit due to context window", "exit_context", "Exit due to context window" + except CostLimitExceededError as e: + logger.warning(f"Cost limit exceeded") + return "Exit due to cost limit", "exit_cost", "Exit due to cost limit" + except RetryError as e: + logger.warning(f"Retry error: {e}") + return ( + f"Exit due to retry error: {e}", + "exit_api", + f"exit due to retry error: {e}", + ) + return self.check_format_and_requery(output) + + def init_environment_vars(self, env): + self.set_environment_vars(env, self.config.env_variables) + + def set_environment_vars(self, env, env_variables): + commands_to_execute = ( + [self.config.state_command.code] + + # [code for code in self.config.util_functions] + + # [command.code for command in self.config._commands] + + [f"{k}={v}" for k,v in env_variables.items()] + ) + commands = "\n".join(commands_to_execute) + try: + output = env.communicate(commands) + if env.returncode != 0: + raise RuntimeError(f"Nonzero return code: {env.returncode}\nOutput: {output}") + except KeyboardInterrupt: + raise + except Exception as e: + logger.warning("Failed to set environment variables") + raise e + command_files = list() + for file in self.config.command_files: + datum = dict() + contents = open(file, 'r').read() + datum['contents'] = contents + filename = Path(file).name + if not contents.strip().startswith('#!'): + if filename.endswith('.sh'): + # files are sourced, so they are not executable + datum['name'] = Path(file).name + datum['type'] = 'source_file' + elif filename.startswith('_'): + # files are sourced, so they are not executable + datum['name'] = Path(file).name + datum['type'] = 'utility' + else: + raise ValueError(( + f"Non-shell script file {file} does not start with shebang.\n" + "Either add a shebang (#!) or change the file extension to .sh if you want to source it.\n" + "You can override this behavior by adding an underscore to the file name (e.g. _utils.py)." 
+ )) + else: + # scripts are made executable + datum['name'] = Path(file).name.rsplit('.', 1)[0] + datum['type'] = 'script' + command_files.append(datum) + env.add_commands(command_files) + + def get_environment_vars(self, env): + env_vars = dict() + for var in self.config.env_variables: + env_vars[var] = env.communicate(f"echo ${var}").strip() + return env_vars + + def call_subroutine(self, agent_name, sub_action, env): + env_vars = self.get_environment_vars(env) + cwd = env.communicate("pwd -P").strip() + init_observation = self.config._subroutines[agent_name].init_observation + if init_observation is not None: + obs, _, _, _ = env.step(init_observation.format(args=sub_action['args'])) + else: + obs = None + if env.returncode != 0: + self.history.append({"role": "user", "content": obs, "agent": agent_name}) + raise RuntimeError(f"Nonzero return code: {env.returncode} for init_observation in {agent_name}.\n{obs}") + return_type = self.config._subroutines[agent_name].return_type + sub_agent = Agent(agent_name, self.config._subroutines[agent_name].agent_args) + sub_agent_output = sub_agent.run( + {"issue": sub_action['args']}, + env, + observation=obs, + return_type=return_type, + init_model_stats=self.model.stats, + ) + self.history += sub_agent.history + self.set_environment_vars(env, env_vars) + env.communicate(f"cd {cwd}") + self.model.stats.replace(sub_agent.model.stats) + return sub_agent_output + + def run( + self, + setup_args, + env: SWEEnv, + observation: str = None, + traj_dir: Optional[Path] = None, + return_type: Optional[str] = "info", + init_model_stats: Optional[APIStats] = None, + ): + """ + Run the agent on an environment. + Return the final value of the specified return type. + """ + done = False + + if env.container_obj.id != self.last_container_id: + logger.info(f"Initializing agent settings for container {env.container_obj.id}") + self.init_environment_vars(env) + self.last_container_id = env.container_obj.id + # Re-initialize primary + self.setup(setup_args, init_model_stats) + + # Run action/observation loop + trajectory = [] + info = {} + while not done: + state = env.communicate(self.state_command) if self.state_command else None + thought, action, output = self.forward( + observation, + env.get_available_actions(), + state) + observations = list() + run_action = self._guard_multiline_input(action) + for sub_action in self.split_actions(run_action): + if sub_action['agent'] == self.name or sub_action['cmd_name'] == self.config.submit_command: + obs, _, done, info = env.step(sub_action['action']) + observations.append(obs) + if sub_action['cmd_name'] == self.config.submit_command: + done = True + if done: + break + else: + agent_name = sub_action['agent'] + sub_agent_output = self.call_subroutine(agent_name, sub_action, env) + observations.append(sub_agent_output) + + observation = '\n'.join([obs for obs in observations if obs is not None]) + + trajectory.append( + { + "action": action, + "observation": observation, + "response": output, + "state": state, + "thought": thought, + } + ) + info['model_stats'] = self.model.stats.to_dict() + if traj_dir: + self.save_trajectory(trajectory, traj_dir, env, info) + if return_type != "info": + return trajectory[-1][return_type] + else: + return info diff --git a/sweagent/agent/commands.py b/sweagent/agent/commands.py new file mode 100644 index 000000000..be808485d --- /dev/null +++ b/sweagent/agent/commands.py @@ -0,0 +1,246 @@ +import re +import yaml + +from abc import abstractmethod +from dataclasses import dataclass +from 
pathlib import Path +from typing import Dict, List, Optional +from simple_parsing.helpers import FrozenSerializable + + +@dataclass(frozen=True) +class AssistantMetadata(FrozenSerializable): + """Pass observations to the assistant, and get back a response.""" + system_template: Optional[str] = None + instance_template: Optional[str] = None + + +# TODO: first can be used for two-stage actions +# TODO: eventually might control high-level control flow +@dataclass(frozen=True) +class ControlMetadata(FrozenSerializable): + """TODO: should be able to control high-level control flow after calling this command""" + next_step_template: Optional[str] = None + next_step_action_template: Optional[str] = None + + +@dataclass(frozen=True) +class Command(FrozenSerializable): + code: str + name: str + docstring: Optional[str] = None + end_name: Optional[str] = None # if there is an end_name, then it is a multi-line command + arguments: Optional[Dict] = None + signature: Optional[str] = None + + +class ParseCommandMeta(type): + _registry = {} + + def __new__(cls, name, bases, attrs): + new_cls = super().__new__(cls, name, bases, attrs) + if name != "ParseCommand": + cls._registry[name] = new_cls + return new_cls + + +@dataclass +class ParseCommand(metaclass=ParseCommandMeta): + @classmethod + def get(cls, name): + try: + return cls._registry[name]() + except KeyError: + raise ValueError(f"Command parser ({name}) not found.") + + @abstractmethod + def parse_command_file(self, path: str) -> List[Command]: + """ + Define how to parse a file into a list of commands. + """ + raise NotImplementedError + + @abstractmethod + def generate_command_docs(self, commands: List[Command], subroutine_types, **kwargs) -> str: + """ + Generate a string of documentation for the given commands and subroutine types. + """ + raise NotImplementedError + + +# DEFINE NEW COMMAND PARSER FUNCTIONS BELOW THIS LINE + +class ParseCommandBash(ParseCommand): + def parse_command_file(self, path: str) -> List[Command]: + print('Parsing command file:', path) + contents = open(path, "r").read() + if contents.strip().startswith("#!"): + commands = self.parse_script(path, contents) + else: + if not path.endswith(".sh") and not Path(path).name.startswith("_"): + raise ValueError(( + f"Source file {path} does not have a .sh extension.\n" + "Only .sh files are supported for bash function parsing.\n" + "If you want to use a non-shell file as a command (script), " + "it should use a shebang (e.g. #!/usr/bin/env python)." + )) + return self.parse_bash_functions(path, contents) + if len(commands) == 0 and not Path(path).name.startswith("_"): + raise ValueError(( + f"Non-shell file {path} does not contain any commands.\n" + "If you want to use a non-shell file as a command (script), " + "it should contain exactly one @yaml docstring. " + "If you want to use a file as a utility script, " + "it should start with an underscore (e.g. _utils.py)." + )) + else: + return commands + + def parse_bash_functions(self, path, contents) -> List[Command]: + """ + Simple logic for parsing a bash file and segmenting it into functions. + + Assumes that all functions have their name and opening curly bracket in one line, + and closing curly bracket in a line by itself. 
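To illustrate the `@yaml` docstring convention that `ParseCommandBash` consumes (see the parsing logic after this sketch), here is a minimal standalone example. Comment lines above a bash function are collected, the leading `# ` and the `@yaml` tag are stripped, and the remainder is parsed as YAML metadata. The `goto` command below is illustrative, not a statement about the bundled command set:

```python
import yaml

bash_source = """\
# @yaml
# docstring: moves the window to show line_number
# signature: goto <line_number>
# arguments:
#   line_number:
#     type: integer
#     description: the line number to move the window to
#     required: true
goto() {
    echo "jumping to $1"
}
"""

# Same recipe as parse_bash_functions: strip "# ", drop the @yaml tag, load YAML.
docs = [line[2:] for line in bash_source.splitlines() if line.startswith("# ")]
meta = yaml.safe_load("\n".join(docs).replace("@yaml", ""))

print(meta["docstring"])                             # moves the window to show line_number
print(meta["signature"])                             # goto <line_number>
print(meta["arguments"]["line_number"]["required"])  # True
```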
+ """ + lines = contents.split("\n") + commands = [] + idx = 0 + docs = [] + while idx < len(lines): + line = lines[idx] + idx += 1 + if line.startswith("# "): + docs.append(line[2:]) + elif line.strip().endswith("() {"): + name = line.split()[0][:-2] + code = line + while lines[idx].strip() != "}": + code += lines[idx] + idx += 1 + code += lines[idx] + docstring, end_name, arguments, signature = None, None, None, name + docs_dict = yaml.safe_load("\n".join(docs).replace('@yaml', '')) + if docs_dict is not None: + docstring = docs_dict["docstring"] + end_name = docs_dict.get("end_name", None) + arguments = docs_dict.get("arguments", None) + if "signature" in docs_dict: + signature = docs_dict["signature"] + else: + if arguments is not None: + for param, settings in arguments.items(): + if settings["required"]: + signature += f" <{param}>" + else: + signature += f" [<{param}>]" + command = Command.from_dict({ + "code": code, + "docstring": docstring, + "end_name": end_name, + "name": name, + "arguments": arguments, + "signature": signature + }) + commands.append(command) + docs = [] + return commands + + def parse_script(self, path, contents) -> List[Command]: + pattern = re.compile(r'^#\s*@yaml\s*\n^#.*(?:\n#.*)*', re.MULTILINE) + matches = pattern.findall(contents) + if len(matches) == 0: + return [] + elif len(matches) > 1: + raise ValueError(( + "Non-shell file contains multiple @yaml tags.\n" + "Only one @yaml tag is allowed per script." + )) + else: + yaml_content = matches[0] + yaml_content = re.sub(r'^#', '', yaml_content, flags=re.MULTILINE) + docs_dict = yaml.safe_load(yaml_content.replace('@yaml', '')) + assert docs_dict is not None + docstring = docs_dict["docstring"] + end_name = docs_dict.get("end_name", None) + arguments = docs_dict.get("arguments", None) + signature = docs_dict.get("signature", None) + name = Path(path).name.rsplit(".", 1)[0] + if signature is None and arguments is not None: + signature = name + for param, settings in arguments.items(): + if settings["required"]: + signature += f" <{param}>" + else: + signature += f" [<{param}>]" + code = contents + return [Command.from_dict({ + "code": code, + "docstring": docstring, + "end_name": end_name, + "name": name, + "arguments": arguments, + "signature": signature + })] + + + def generate_command_docs(self, commands: List[Command], subroutine_types, **kwargs) -> str: + docs = "" + for cmd in commands: + if cmd.docstring is not None: + docs += f"{cmd.signature or cmd.name} - {cmd.docstring.format(**kwargs)}\n" + for subroutine in subroutine_types: + if subroutine.docstring is not None: + docs += f"{subroutine.signature or subroutine.name} - {subroutine.docstring.format(**kwargs)}\n" + return docs + + +class ParseCommandDetailed(ParseCommandBash): + """ + # command_name: + # "docstring" + # signature: "signature" + # arguments: + # arg1 (type) [required]: "description" + # arg2 (type) [optional]: "description" + """ + def get_signature(cmd): + signature = cmd.name + if "arguments" in cmd.__dict__ and cmd.arguments is not None: + if cmd.end_name is None: + for param, settings in cmd.arguments.items(): + if settings["required"]: + signature += f" <{param}>" + else: + signature += f" [<{param}>]" + else: + for param, settings in list(cmd.arguments.items())[:-1]: + if settings["required"]: + signature += f" <{param}>" + else: + signature += f" [<{param}>]" + signature += f"\n{list(cmd.arguments[-1].keys())[0]}\n{cmd.end_name}" + return signature + + def generate_command_docs( + self, + commands: List[Command], + 
subroutine_types, + **kwargs, + ) -> str: + docs = "" + for cmd in commands + subroutine_types: + docs += f"{cmd.name}:\n" + if cmd.docstring is not None: + docs += f" docstring: {cmd.docstring}\n" + if cmd.signature is not None: + docs += f" signature: {cmd.signature}\n" + else: + docs += f" signature: {self.get_signature(cmd)}\n" + if "arguments" in cmd.__dict__ and cmd.arguments is not None: + docs += " arguments:\n" + for param, settings in cmd.arguments.items(): + req_string = "required" if settings["required"] else "optional" + docs += f" - {param} ({settings['type']}) [{req_string}]: {settings['description']}\n" + docs += "\n" + return docs diff --git a/sweagent/agent/history_processors.py b/sweagent/agent/history_processors.py new file mode 100644 index 000000000..f64211984 --- /dev/null +++ b/sweagent/agent/history_processors.py @@ -0,0 +1,123 @@ +import re + +from abc import abstractmethod +from dataclasses import dataclass + + +class FormatError(Exception): + pass + +# ABSTRACT BASE CLASSES + +class HistoryProcessorMeta(type): + _registry = {} + + def __new__(cls, name, bases, attrs): + new_cls = super().__new__(cls, name, bases, attrs) + if name != "HistoryProcessor": + cls._registry[name] = new_cls + return new_cls + + +@dataclass +class HistoryProcessor(metaclass=HistoryProcessorMeta): + def __init__(self, *args, **kwargs): + pass + + @abstractmethod + def __call__(self, history: list[str]) -> list[str]: + raise NotImplementedError + + @classmethod + def get(cls, name, *args, **kwargs): + try: + return cls._registry[name](*args, **kwargs) + except KeyError: + raise ValueError(f"Model output parser ({name}) not found.") + + +# DEFINE NEW PARSING FUNCTIONS BELOW THIS LINE +class DefaultHistoryProcessor(HistoryProcessor): + def __call__(self, history): + return history + + +def last_n_history(history, n): + if n <= 0: + raise ValueError('n must be a positive integer') + new_history = list() + user_messages = len([entry for entry in history if (entry['role'] == 'user' and not entry.get('is_demo', False))]) + user_msg_idx = 0 + for entry in history: + data = entry.copy() + if data['role'] != 'user': + new_history.append(entry) + continue + if data.get('is_demo', False): + new_history.append(entry) + continue + else: + user_msg_idx += 1 + if user_msg_idx == 1 or user_msg_idx in range(user_messages - n + 1, user_messages + 1): + new_history.append(entry) + else: + data['content'] = f'Old output ommitted ({len(entry["content"].splitlines())} lines)' + new_history.append(data) + return new_history + + +class LastNObservations(HistoryProcessor): + def __init__(self, n): + self.n = n + + def __call__(self, history): + return last_n_history(history, self.n) + + +class Last2Observations(HistoryProcessor): + def __call__(self, history): + return last_n_history(history, 2) + + +class Last5Observations(HistoryProcessor): + def __call__(self, history): + return last_n_history(history, 5) + + +class ClosedWindowHistoryProcessor(HistoryProcessor): + pattern = re.compile(r'^(\d+)\:.*?(\n|$)', re.MULTILINE) + file_pattern = re.compile(r'\[File:\s+(.*)\s+\(\d+\s+lines\ total\)\]') + + def __call__(self, history): + new_history = list() + # For each value in history, keep track of which windows have been shown. + # We want to mark windows that should stay open (they're the last window for a particular file) + # Then we'll replace all other windows with a simple summary of the window (i.e. 
number of lines) + windows = set() + for entry in reversed(history): + data = entry.copy() + if data['role'] != 'user': + new_history.append(entry) + continue + if data.get('is_demo', False): + new_history.append(entry) + continue + matches = list(self.pattern.finditer(entry['content'])) + if len(matches) >= 1: + file_match = self.file_pattern.search(entry['content']) + if file_match: + file = file_match.group(1) + else: + continue + if file in windows: + start = matches[0].start() + end = matches[-1].end() + data['content'] = ( + entry['content'][:start] +\ + f'Outdated window with {len(matches)} lines ommitted...\n' +\ + entry['content'][end:] + ) + windows.add(file) + new_history.append(data) + history = list(reversed(new_history)) + return history \ No newline at end of file diff --git a/sweagent/agent/models.py b/sweagent/agent/models.py new file mode 100644 index 000000000..c99b46bf0 --- /dev/null +++ b/sweagent/agent/models.py @@ -0,0 +1,630 @@ +import config +import json +import logging +import os +import together + +from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT +from dataclasses import dataclass, fields +from openai import BadRequestError, OpenAI +from simple_parsing.helpers import FrozenSerializable, Serializable +from sweagent.agent.commands import Command +from tenacity import ( + retry, + stop_after_attempt, + wait_random_exponential, + retry_if_not_exception_type, +) +from typing import Optional + +logger = logging.getLogger("api_models") + + +@dataclass(frozen=True) +class ModelArguments(FrozenSerializable): + model_name: str + per_instance_cost_limit: float = 0.0 + total_cost_limit: float = 0.0 + temperature: float = 1.0 + top_p: float = 1.0 + replay_path: str = None + + +@dataclass +class APIStats(Serializable): + total_cost: float = 0 + instance_cost: float = 0 + tokens_sent: int = 0 + tokens_received: int = 0 + api_calls: int = 0 + + def __add__(self, other): + if not isinstance(other, APIStats): + raise TypeError("Can only add APIStats with APIStats") + + return APIStats(**{ + field.name: getattr(self, field.name) + getattr(other, field.name) + for field in fields(self) + }) + + def replace(self, other): + if not isinstance(other, APIStats): + raise TypeError("Can only replace APIStats with APIStats") + + return APIStats(**{ + field.name: getattr(other, field.name) + for field in fields(self) + }) + + +class ContextWindowExceededError(Exception): + pass + + +class CostLimitExceededError(Exception): + pass + + +class BaseModel: + MODELS = {} + SHORTCUTS = {} + + def __init__(self, args: ModelArguments, commands: list[Command]): + self.args = args + self.commands = commands + self.model_metadata = {} + self.stats = APIStats() + + # Map `model_name` to API-compatible name `api_model` + self.api_model = ( + self.SHORTCUTS[self.args.model_name] + if self.args.model_name in self.SHORTCUTS + else self.args.model_name + ) + + # Map model name to metadata (cost, context info) + MODELS = { + **{dest: self.MODELS[src] for dest, src in self.SHORTCUTS.items()}, + **self.MODELS, + } + if args.model_name in MODELS: + self.model_metadata = MODELS[args.model_name] + elif args.model_name.startswith("ft:"): + ft_model = args.model_name.split(":")[1] + self.model_metadata = MODELS[ft_model] + else: + raise ValueError(f"Unregistered model ({args.model_name}). 
Add model name to MODELS metadata to {self.__class__}") + + def reset_stats(self, other: APIStats = None): + if other is None: + self.stats = APIStats(total_cost=self.stats.total_cost) + logger.info("Resetting model stats") + else: + self.stats = other + + def update_stats(self, input_tokens, output_tokens): + """ + Calculates the cost of a response from the openai API. + + Args: + input_tokens (int): The number of tokens in the prompt. + output_tokens (int): The number of tokens in the response. + + Returns: + float: The cost of the response. + """ + # Calculate cost and update cost related fields + cost = ( + self.model_metadata["cost_per_input_token"] * input_tokens + + self.model_metadata["cost_per_output_token"] * output_tokens + ) + self.stats.total_cost += cost + self.stats.instance_cost += cost + self.stats.tokens_sent += input_tokens + self.stats.tokens_received += output_tokens + self.stats.api_calls += 1 + + # Log updated cost values to std. out. + logger.info( + f"input_tokens={input_tokens:_}, " + f"output_tokens={output_tokens:_}, " + f"instance_cost={self.stats.instance_cost:.2f}, " + f"cost={cost:.2f}" + ) + logger.info( + f"total_tokens_sent={self.stats.tokens_sent:_}, " + f"total_tokens_received={self.stats.tokens_received:_}, " + f"total_cost={self.stats.total_cost:.2f}, " + f"total_api_calls={self.stats.api_calls:_}" + ) + + # Check whether total cost or instance cost limits have been exceeded + if ( + self.args.total_cost_limit > 0 + and self.stats.total_cost >= self.args.total_cost_limit + ): + logger.warning( + f"Cost {self.stats.total_cost:.2f} exceeds limit {self.args.total_cost_limit:.2f}" + ) + raise CostLimitExceededError("Total cost limit exceeded") + + if ( + self.args.per_instance_cost_limit > 0 + and self.stats.instance_cost >= self.args.per_instance_cost_limit + ): + logger.warning( + f"Cost {self.stats.instance_cost:.2f} exceeds limit {self.args.per_instance_cost_limit:.2f}" + ) + raise CostLimitExceededError("Instance cost limit exceeded") + return cost + + def query(self, history: list[dict[str, str]]) -> str: + raise NotImplementedError("Use a subclass of BaseModel") + + +class OpenAIModel(BaseModel): + MODELS = { + "gpt-3.5-turbo-0125": { + "max_context": 16_385, + "cost_per_input_token": 5e-07, + "cost_per_output_token": 1.5e-06, + }, + "gpt-3.5-turbo-1106": { + "max_context": 16_385, + "cost_per_input_token": 1.5e-06, + "cost_per_output_token": 2e-06, + }, + "gpt-3.5-turbo-16k-0613": { + "max_context": 16_385, + "cost_per_input_token": 1.5e-06, + "cost_per_output_token": 2e-06, + }, + "gpt-4-32k-0613": { + "max_context": 32_768, + "cost_per_input_token": 6e-05, + "cost_per_output_token": 0.00012, + }, + "gpt-4-0613": { + "max_context": 8_192, + "cost_per_input_token": 3e-05, + "cost_per_output_token": 6e-05, + }, + "gpt-4-1106-preview": { + "max_context": 128_000, + "cost_per_input_token": 1e-05, + "cost_per_output_token": 3e-05, + }, + "gpt-4-0125-preview": { + "max_context": 128_000, + "cost_per_input_token": 1e-05, + "cost_per_output_token": 3e-05, + }, + } + + SHORTCUTS = { + "gpt3": "gpt-3.5-turbo-1106", + "gpt3-legacy": "gpt-3.5-turbo-16k-0613", + "gpt4": "gpt-4-1106-preview", + "gpt4-legacy": "gpt-4-0613", + "gpt4-0125": "gpt-4-0125-preview", + "gpt3-0125": "gpt-3.5-turbo-0125", + } + + def __init__(self, args: ModelArguments, commands: list[Command]): + super().__init__(args, commands) + + # Set OpenAI key + cfg = config.Config(os.path.join(os.getcwd(), "keys.cfg")) + self.client = OpenAI(api_key=cfg["OPENAI_API_KEY"]) + + def 
history_to_messages( + self, history: list[dict[str, str]], is_demonstration: bool = False + ) -> list[dict[str, str]]: + """ + Create `messages` by filtering out all keys except for role/content per `history` turn + """ + # Remove system messages if it is a demonstration + if is_demonstration: + history = [entry for entry in history if entry["role"] != "system"] + return '\n'.join([entry["content"] for entry in history]) + # Return history components with just role, content fields + return [ + {k: v for k, v in entry.items() if k in ["role", "content"]} + for entry in history + ] + + @retry( + wait=wait_random_exponential(min=1, max=15), + reraise=True, + stop=stop_after_attempt(3), + retry=retry_if_not_exception_type((CostLimitExceededError, RuntimeError)), + ) + def query(self, history: list[dict[str, str]]) -> str: + """ + Query the OpenAI API with the given `history` and return the response. + """ + try: + # Perform OpenAI API call + response = self.client.chat.completions.create( + messages=self.history_to_messages(history), + model=self.api_model, + temperature=self.args.temperature, + top_p=self.args.top_p, + ) + except BadRequestError as e: + raise CostLimitExceededError(f"Context window ({self.model_metadata['max_context']} tokens) exceeded") + # Calculate + update costs, return response + input_tokens = response.usage.prompt_tokens + output_tokens = response.usage.completion_tokens + self.update_stats(input_tokens, output_tokens) + return response.choices[0].message.content + + +class AnthropicModel(BaseModel): + MODELS = { + "claude-instant": { + "max_context": 100_000, + "cost_per_input_token": 1.63e-06, + "cost_per_output_token": 5.51e-06, + }, + "claude-2": { + "max_context": 100_000, + "cost_per_input_token": 1.102e-05, + "cost_per_output_token": 3.268e-05, + }, + "claude-2.1": { + "max_context": 100_000, + "cost_per_input_token": 1.102e-05, + "cost_per_output_token": 3.268e-05, + }, + "claude-3-opus-20240229": { + "max_context": 200_000, + "max_tokens": 4096, # Max tokens to generate for Claude 3 models + "cost_per_input_token": 1.5e-05, + "cost_per_output_token": 7.5e-05, + }, + "claude-3-sonnet-20240229": { + "max_context": 200_000, + "max_tokens": 4096, + "cost_per_input_token": 3e-06, + "cost_per_output_token": 1.5e-05, + }, + } + + SHORTCUTS = { + "claude": "claude-2", + "claude-opus": "claude-3-opus-20240229", + "claude-sonnet": "claude-3-sonnet-20240229", + } + + def __init__(self, args: ModelArguments, commands: list[Command]): + super().__init__(args, commands) + + # Set Anthropic key + cfg = config.Config(os.path.join(os.getcwd(), "keys.cfg")) + self.api = Anthropic(api_key=cfg["ANTHROPIC_API_KEY"]) + + def history_to_messages( + self, history: list[dict[str, str]], is_demonstration: bool = False + ) -> list[dict[str, str]]: + """ + Create `prompt` by filtering out all keys except for role/content per `history` turn + Reference: https://docs.anthropic.com/claude/reference/complete_post + """ + # Preserve behavior for older models + if self.api_model in ["claude-instant", "claude-2"]: + # Remove system messages if it is a demonstration + if is_demonstration: + history = [entry for entry in history if entry["role"] != "system"] + # Map history to Claude format + prompt = "\n\n" + for entry in history: + if entry["role"] in {"user", "system"}: + prompt += f'{HUMAN_PROMPT} {entry["content"]}\n\n' + elif entry["role"] == "assistant": + prompt += f'{AI_PROMPT} {entry["content"]}\n\n' + prompt += AI_PROMPT + return prompt + + # Remove system messages if it is a 
demonstration + if is_demonstration: + history = [entry for entry in history if entry["role"] != "system"] + return '\n'.join([entry["content"] for entry in history]) + + # Return history components with just role, content fields (no system message) + messages = [ + { + k: v for k, v in entry.items() + if k in ["role", "content"] + } + for entry in history if entry["role"] != "system" + ] + compiled_messages = [] # Combine messages from the same role + last_role = None + for message in reversed(messages): + if last_role == message["role"]: + compiled_messages[-1]["content"] = message["content"] + "\n" + compiled_messages[-1]["content"] + else: + compiled_messages.append(message) + last_role = message["role"] + compiled_messages = list(reversed(compiled_messages)) + # Replace any empty content values with a "(No output)" + for message in compiled_messages: + if message["content"].strip() == "": + message["content"] = "(No output)" + return compiled_messages + + @retry( + wait=wait_random_exponential(min=1, max=15), + reraise=True, + stop=stop_after_attempt(3), + retry=retry_if_not_exception_type((CostLimitExceededError, RuntimeError)), + ) + def query(self, history: list[dict[str, str]]) -> str: + """ + Query the Anthropic API with the given `history` and return the response. + """ + # Preserve behavior for older models + if self.api_model in ["claude-instant", "claude-2"]: + # Perform Anthropic API call + prompt = self.history_to_messages(history) + input_tokens = self.api.count_tokens(prompt) + completion = self.api.completions.create( + model=self.api_model, + prompt=prompt, + max_tokens_to_sample=self.model_metadata["max_context"] - input_tokens, + temperature=self.args.temperature, + top_p=self.args.top_p, + ) + # Calculate + update costs, return response + response = completion.completion + output_tokens = self.api.count_tokens(response) + self.update_stats(input_tokens, output_tokens) + return response + + # Get system message(s) + system_message = "\n".join([ + entry["content"] for entry in history if entry["role"] == "system" + ]) + messages = self.history_to_messages(history) + # Perform Anthropic API call + response = self.api.messages.create( + messages=messages, + max_tokens=self.model_metadata["max_tokens"], + model=self.api_model, + temperature=self.args.temperature, + top_p=self.args.top_p, + system=system_message, + ) + + # Calculate + update costs, return response + self.update_stats( + response.usage.input_tokens, + response.usage.output_tokens + ) + response = "\n".join([x.text for x in response.content]) + return response + + +class TogetherModel(BaseModel): + # Check https://docs.together.ai/docs/inference-models for model names, context + # Check https://www.together.ai/pricing for pricing + MODELS = { + "meta-llama/Llama-2-13b-chat-hf": { + "max_context": 4096, + "cost_per_input_token": 2.25e-07, + "cost_per_output_token": 2.25e-07, + }, + "meta-llama/Llama-2-70b-chat-hf": { + "max_context": 4096, + "cost_per_input_token": 9e-07, + "cost_per_output_token": 9e-07, + }, + "mistralai/Mistral-7B-Instruct-v0.2": { + "max_context": 32768, + "cost_per_input_token": 2e-07, + "cost_per_output_token": 2e-07, + }, + "togethercomputer/RedPajama-INCITE-7B-Chat": { + "max_context": 2048, + "cost_per_input_token": 2e-07, + "cost_per_output_token": 2e-07, + }, + "mistralai/Mixtral-8x7B-Instruct-v0.1": { + "max_context": 32768, + "cost_per_input_token": 6e-07, + "cost_per_output_token": 6e-07, + }, + } + + SHORTCUTS = { + "llama13b": "meta-llama/Llama-2-13b-chat-hf", + "llama70b": 
"meta-llama/Llama-2-70b-chat-hf", + "mistral7b": "mistralai/Mistral-7B-Instruct-v0.2", + "mixtral8x7b": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "redpajama7b": "togethercomputer/RedPajama-INCITE-7B-Chat", + } + + def __init__(self, args: ModelArguments, commands: list[Command]): + super().__init__(args, commands) + + # Set Together key + cfg = config.Config(os.path.join(os.getcwd(), "keys.cfg")) + together.api_key = cfg.TOGETHER_API_KEY + + def history_to_messages( + self, history: list[dict[str, str]], is_demonstration: bool = False + ) -> str: + """ + Create `prompt` by filtering out all keys except for role/content per `history` turn + """ + # Remove system messages if it is a demonstration + if is_demonstration: + history = [entry for entry in history if entry["role"] != "system"] + # Map history to TogetherAI format + mapping = {"user": "human", "assistant": "bot", "system": "bot"} + prompt = [f'<{mapping[d["role"]]}>: {d["content"]}' for d in history] + prompt = "\n".join(prompt) + prompt = f"{prompt}\n:" + return prompt + + @retry( + wait=wait_random_exponential(min=1, max=15), + reraise=True, + stop=stop_after_attempt(3), + retry=retry_if_not_exception_type((CostLimitExceededError, RuntimeError)), + ) + def query(self, history: list[dict[str, str]]) -> str: + """ + Query the Together API with the given `history` and return the response. + """ + # Perform Together API call + prompt = self.history_to_messages(history) + completion = together.Complete.create( + model=self.api_model, + prompt=prompt, + max_tokens=self.model_metadata["max_context"], + stop="", + temperature=self.args.temperature, + top_p=self.args.top_p, + ) + # Calculate + update costs, return response + response = completion["output"]["choices"][0]["text"].split("")[0] + input_tokens = completion["output"]["usage"]["prompt_tokens"] + output_tokens = completion["output"]["usage"]["completion_tokens"] + self.update_stats(input_tokens, output_tokens) + return response + + +class HumanModel(BaseModel): + MODELS = {"human": {}} + + def __init__(self, args: ModelArguments, commands: list[Command]): + super().__init__(args, commands) + + # Determine which commands require multi-line input + self.multi_line_command_endings = { + command.name: command.end_name + for command in commands + if command.end_name is not None + } + + def history_to_messages( + self, history: list[dict[str, str]], is_demonstration: bool = False + ) -> list[dict[str, str]]: + """ + Create `messages` by filtering out all keys except for role/content per `history` turn + """ + # Remove system messages if it is a demonstration + if is_demonstration: + history = [entry for entry in history if entry["role"] != "system"] + return '\n'.join([entry["content"] for entry in history]) + # Return history components with just role, content fields + return [ + {k: v for k, v in entry.items() if k in ["role", "content"]} + for entry in history + ] + + def query(self, history: list[dict[str, str]], action_prompt: str = "> ") -> str: + """ + Logic for handling user input to pass to SWEEnv + """ + action = input(action_prompt) + command_name = action.split()[0] if action else "" + + # Special handling for multi-line input actions (i.e. edit) + if command_name in self.multi_line_command_endings: + buffer = [action] + end_keyword = self.multi_line_command_endings[command_name] + while True: + action = input("... 
") + buffer.append(action) + if action.rstrip() == end_keyword: + # Continue reading input until terminating keyword inputted + break + action = "\n".join(buffer) + elif action.strip() == "start_multiline_command": # do arbitrary multi-line input + buffer = [] + while True: + action = input("... ") + if action.rstrip() == "end_multiline_command": + break + buffer.append(action) + action = "\n".join(buffer) + return action + + +class HumanThoughtModel(HumanModel): + MODELS = {"human_thought": {}} + + def query(self, history: list[dict[str, str]]) -> str: + """ + Logic for handling user input (both thought + action) to pass to SWEEnv + """ + thought_all = "" + thought = input("Thought (end w/ END_THOUGHT): ") + while True: + if "END_THOUGHT" in thought: + thought = thought.split("END_THOUGHT")[0] + thought_all += thought + break + thought_all += thought + thought = input("... ") + + action = super().query(history, action_prompt="Action: ") + + return f"{thought_all}\n```\n{action}\n```" + + +class ReplayModel(BaseModel): + MODELS = {"replay": {}} + + def __init__(self, args: ModelArguments, commands: list[Command]): + super().__init__(args, commands) + + if self.args.replay_path == None or not os.path.exists(self.args.replay_path): + raise ValueError( + "--replay_path must point to a file that exists to run a replay policy" + ) + + self.replays = [ + list(json.loads(x).values())[0] + for x in open(self.args.replay_path, "r").readlines() + ] + self.replay_idx = 0 + self.action_idx = 0 + + def query(self, history: list[dict[str, str]]) -> str: + """ + Logic for tracking which replay action to pass to SWEEnv + """ + action = self.replays[self.replay_idx][self.action_idx] + self.action_idx += 1 + + # Assuming `submit` is always last action of replay trajectory + if action == "submit": + self.replay_idx += 1 + self.action_idx = 0 + + return action + + +def get_model(args: ModelArguments, commands: Optional[list[Command]] = None): + """ + Returns correct model object given arguments and commands + """ + if commands is None: + commands = [] + + if args.model_name == "human": + return HumanModel(args, commands) + if args.model_name == "human_thought": + return HumanThoughtModel(args, commands) + if args.model_name == "replay": + return ReplayModel(args, commands) + elif args.model_name.startswith("gpt") or args.model_name.startswith("ft:gpt"): + return OpenAIModel(args, commands) + elif args.model_name.startswith("claude"): + return AnthropicModel(args, commands) + else: + raise ValueError(f"Invalid model name: {args.model_name}") diff --git a/sweagent/agent/parsing.py b/sweagent/agent/parsing.py new file mode 100644 index 000000000..019ca4c59 --- /dev/null +++ b/sweagent/agent/parsing.py @@ -0,0 +1,312 @@ +import re +import json +import shlex +import string +import textwrap + +from abc import abstractmethod +from dataclasses import dataclass +from sweagent.agent.commands import Command +from typing import List + + +class FormatError(Exception): + pass + +# ABSTRACT BASE CLASSES + +class ParseFunctionMeta(type): + """ + Registry maps all inherited classes to their names. + """ + _registry = {} + + def __new__(cls, name, bases, attrs): + new_cls = super().__new__(cls, name, bases, attrs) + if name != "ParseFunction": + cls._registry[name] = new_cls + return new_cls + + +@dataclass +class ParseFunction(metaclass=ParseFunctionMeta): + """ + Abstract class for parsing functions. + We use get to generate the right parser based on the name of the parser. 
+ """ + _error_message = None + + @abstractmethod + def __call__(self, model_response, commands: List[Command], strict=False): + raise NotImplementedError + + @property + def format_error_template(self): + if self._error_message is None: + raise NotImplementedError("You must define an error message for your parser.") + return textwrap.dedent(self._error_message) + + @classmethod + def get(cls, name): + try: + return cls._registry[name]() + except KeyError: + raise ValueError(f"Model output parser ({name}) not found.") + + +# DEFINE NEW PARSING FUNCTIONS BELOW THIS LINE + +class ActionParser(ParseFunction): + """ + Expects the model response to be a single command. + Example: "ls -l" + """ + _error_message = """\ + The command you provided was not recognized. Please specify one of the commands (+ any necessary arguments) from the following list in your response. Do not include any other text. + + COMMANDS: + {command_docs} + """ + + def __call__(self, model_response, commands: List[Command], strict=False): + if model_response.split(): + action = model_response.strip().split()[0] + if action in {command.name for command in commands}: + return model_response, model_response + raise FormatError("First word in model response is not a valid command.") + + +class ThoughtActionParser(ParseFunction): + """ + Expects the model response to be a discussion followed by a command wrapped in backticks. + Example: + Let's look at the files in the current directory. + ``` + ls -l + ``` + """ + _error_message = """\ + Your output was not formatted correctly. You must always include one discussion and one command as part of your response. Make sure you do not have multiple discussion/command tags. + Please make sure your output precisely matches the following format: + DISCUSSION + Discuss here with yourself about what your planning and what you're going to do in this step. + + ``` + command(s) that you're going to run + ``` + """ + + def __call__(self, model_response, commands: List[Command], strict=False): + """ + Parses the action from the output of the API call. + We assume that the action is the last code block in the model_response. + We also assume that the action is not nested within another code block. + This is problematic if the model_response includes many unnamed ``` blocks. + For instance: + ``` + This is a code block. + ``` + ``` + This is another code block. + ``` + + In this case, only the second code block will be parsed as the action. + """ + code_block_pat = re.compile(r'^```(\S*)\s*\n|^```\s*$', re.MULTILINE) + stack = [] + last_valid_block = None + for match in code_block_pat.finditer(model_response): + if stack and not match.group(1): # Closing of a code block + start = stack.pop() + # Check if it's not nested within another block + if not stack: + last_valid_block = (start, match) + elif match.group(1) is not None: # Opening of a code block + stack.append(match) + if last_valid_block: + start, end = last_valid_block + thought = model_response[:start.start()] + model_response[end.end():] + return thought, model_response[start.end():end.start()] + raise FormatError("No action found in model response.") + + +class XMLThoughtActionParser(ParseFunction): + """ + Expects the model response to be a discussion followed by a command wrapped in XML tags. + Example: + Let's look at the files in the current directory. + + ls -l + + """ + _error_message = """\ + Your output was not formatted correctly. You must always include one discussion and one command as part of your response. 
Make sure you do not have multiple discussion/command tags. + Please make sure your output precisely matches the following format: + """ + + def __call__(self, model_response, commands: List[Command], strict=False): + """ + Parses the action from the output of the API call. + We assume that the action is the last code block in the model_response. + We also assume that the action is not nested within another code block. + This is problematic if the model_response includes many unnamed ``` blocks. + For instance: + + This is a code block. + + + This is another code block. + + + In this case, only the second code block will be parsed as the action. + """ + if "" not in model_response or "" not in model_response: + raise FormatError("No action found in model response.") + # `action` is everything between the last and tags + start_action = model_response.rfind('') + len('') # start after the last tag + end_thought = model_response.rfind('') # end before the last tag + end_action = model_response.rfind('') # end before the last tag + restart_thought = model_response.rfind('') + len('') # start after the last tag + # `thought` is everything not in between and tags (includes after the last tag) + action = model_response[start_action:end_action] + thought = model_response[:end_thought] + model_response[restart_thought:] + + return thought.strip(), action.strip() + + +class EditFormat(ThoughtActionParser): + """ + Expects the model response to be a discussion followed by a command wrapped in backticks. + Example: + We'll replace the contents of the current window with the following: + ``` + import os + os.listdir() + ``` + """ + _error_message = """\ + Your output was not formatted correctly. You must wrap the replacement text in backticks (```). + Please make sure your output precisely matches the following format: + COMMENTS + You can write comments here about what you're going to do if you want. + + ``` + New window contents. + Make sure you copy the entire contents of the window here, with the required indentation. + Make the changes to the window above directly in this window. + Remember that all of the window's contents will be replaced with the contents of this window. + Don't include line numbers in your response. + ``` + """ + + +class Identity(ParseFunction): + """ + This parser does not do any parsing. It just returns the model response as both the thought and action. + """ + _error_message = """\ + It seems like something went wrong with your output. Please try again. + """ + + def __call__(self, model_response, commands: List[Command], strict=False): + """ + This doesn't do any parsing. It just returns the model response as the thought and action. + """ + return model_response, model_response + + +class JsonParser(ParseFunction): + """ + Expects the model response to be a JSON object. + """ + _error_message = """\ + Your output could not be parsed as JSON. Please make sure your output 1) is valid JSON and + 2) Includes the "thought" and "command" fields. + + """ + + def __call__(self, model_response, commands: List[Command], strict=False): + """ + Parses the action from the output of the API call. + We assume that model output is a JSON object with the following fields: + { + "thought": "discussion text here.", + "command": { + "arguments": { + "arg1": "value1", + "arg2": "value2", + ... 
+class JsonParser(ParseFunction):
+    """
+    Expects the model response to be a JSON object.
+    """
+    _error_message = """\
+    Your output could not be parsed as JSON. Please make sure your output 1) is valid JSON and
+    2) includes the "thought" and "command" fields.
+
+    """
+
+    def __call__(self, model_response, commands: List[Command], strict=False):
+        """
+        Parses the action from the output of the API call.
+        We assume that model output is a JSON object with the following fields:
+        {
+            "thought": "discussion text here.",
+            "command": {
+                "arguments": {
+                    "arg1": "value1",
+                    "arg2": "value2",
+                    ...
+                },
+                "name": "command_name"
+            }
+        }
+        """
+        try:
+            data = json.loads(model_response)
+            if not isinstance(data, dict):
+                raise FormatError("Model output is not a JSON object.")
+
+            # Check if required keys are present
+            required_keys = ["thought", "command"]
+            for key in required_keys:
+                if key not in data:
+                    raise FormatError(f"Key '{key}' is missing from model output.")
+
+            # Check structure of 'command' key
+            data_command = data["command"]
+            if not isinstance(data_command, dict):
+                raise FormatError("Value of 'command' key is not a JSON object.")
+
+            # Check if required keys are present in 'command' object
+            command_keys = ["name"]
+            for key in command_keys:
+                if key not in data_command:
+                    raise FormatError(f"Key '{key}' is missing from 'command' object.")
+
+            thought = data["thought"]
+
+            # Generate action
+            commands_dict = {c.name: c for c in commands}
+            command = commands_dict.get(data_command["name"])
+            if command is None:
+                action = data_command["name"]
+                if "arguments" in data_command:
+                    action += " " + ' '.join(data_command["arguments"].values())
+            else:
+                signature = command.signature
+                signature = signature.replace("[", "").replace("]", "")\
+                    .replace("<", "{").replace(">", "}")
+                signature_args = extract_keys(signature)
+                command_args = {k: "" for k in signature_args}
+
+                if "arguments" in data_command:
+                    for arg in signature_args:
+                        if arg in data_command["arguments"]:
+                            value = data_command["arguments"][arg]
+                            if should_quote(value, command):
+                                value = shlex.quote(value)
+                            command_args[arg] = value
+                action = signature.format(**command_args)
+            action = action.strip()
+            return thought, action
+        except json.JSONDecodeError:
+            raise FormatError("Model output is not valid JSON.")
+
+
+def extract_keys(format_string):
+    """
+    Given a format string, returns a set of all the keys in the format string.
+    """
+    formatter = string.Formatter()
+    keys = set()
+    for _, field_name, _, _ in formatter.parse(format_string):
+        if field_name is not None:
+            keys.add(field_name)
+    return keys
+
+
+def should_quote(value, command):
+    """
+    Returns True if the value should be quoted, False otherwise.
+    """
+    return (isinstance(value, str) and command.end_name is None)
\ No newline at end of file
+ """ + return (isinstance(value, str) and command.end_name is None) \ No newline at end of file diff --git a/sweagent/environment/__init__.py b/sweagent/environment/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/sweagent/environment/swe_env.py b/sweagent/environment/swe_env.py new file mode 100644 index 000000000..fbb141578 --- /dev/null +++ b/sweagent/environment/swe_env.py @@ -0,0 +1,662 @@ +import config +import datetime +import docker +import gymnasium as gym +import hashlib +import logging +import os +import re +import subprocess +import traceback +import time + +from dataclasses import dataclass +from git import Repo +from rich.logging import RichHandler +from simple_parsing.helpers import FrozenSerializable +from sweagent.environment.utils import ( + copy_file_to_container, + get_container, + get_instances, + is_from_github_url, + read_with_timeout, + LOGGER_NAME, +) +from swebench import ( + get_environment_yml, + get_requirements, + MAP_VERSION_TO_INSTALL +) +from typing import Optional, Tuple + +LONG_TIMEOUT = 500 +PATH_TO_REQS = "/root/requirements.txt" +PATH_TO_ENV_YML = "/root/environment.yml" + +handler = RichHandler(show_time=False, show_path=False) +handler.setLevel(logging.DEBUG) +logger = logging.getLogger(LOGGER_NAME) +logger.setLevel(logging.DEBUG) +logger.addHandler(handler) +logger.propagate = False + + +@dataclass(frozen=True) +class EnvironmentArguments(FrozenSerializable): + data_path: str + image_name: str + split: str = "dev" + base_commit: Optional[str] = None # used only with data_path as url + container_name: Optional[str] = None + install_environment: bool = True + timeout: int = 35 + verbose: bool = False + no_mirror: bool = False + + +class SWEEnv(gym.Env): + """Gym environment for SWE-bench. This class should handle all communication with the docker container.""" + + name = "swe_main" + + def __init__(self, args: EnvironmentArguments): + super().__init__() + self.args = args + self.base_commit = None + self.communicate_output = None + self.container_name = args.container_name + self.install_environment = args.install_environment + self.logger = logger + self.persistent = args.container_name is not None + self.returncode = None + self.is_from_github_url = is_from_github_url(args.data_path) + if not self.args.verbose: + self.logger.disabled = True + + # Get commit hash + try: + repo = Repo(search_parent_directories=True) + self.commit_sha = repo.head.object.hexsha + except KeyboardInterrupt: + raise + except: + logger.warning("Failed to get commit hash for this repo") + self.commit_sha = None + + # Load Task Instances + self.data_path = self.args.data_path + self.data = get_instances(self.data_path, self.args.base_commit, self.args.split) + self.logger.info(f"๐Ÿ’ฝ Loaded dataset from {self.data_path}") + + # Set GitHub Token + self.token = os.environ.get("GITHUB_TOKEN", None) + if (self.token is None or self.token == "") and os.path.isfile( + os.path.join(os.getcwd(), "keys.cfg") + ): + self.cfg = config.Config(os.path.join(os.getcwd(), "keys.cfg")) + self.token = self.cfg.get("GITHUB_TOKEN", "git") + + # Establish connection with execution container + self.image_name = args.image_name + self._reset_container() + + # Set timeout + self.timeout = self.args.timeout + self.idx = 0 + self.clean_multi_line_functions = lambda x: x + + def reset(self, index: int = None, apply_test_patch: bool = False) -> Tuple[str, dict]: + """ + Function to reset container between each task instance. 
+    def reset(self, index: int = None, apply_test_patch: bool = False) -> Tuple[str, dict]:
+        """
+        Function to reset container between each task instance.
+        * Clones instance's repository
+        * Cleans repository of prior modifications
+        * Resets environment variables
+        * Checks out base commit
+
+        Arguments:
+            index (`int`) - index of task instance to reset to
+            apply_test_patch (`bool`) - whether to apply the instance's test patch (oracle setting)
+        Returns:
+            observation (`str`) - output from container
+            info (`dict`) - additional information (e.g. debugging information)
+        """
+        info = {}
+        info["commit_sha"] = self.commit_sha
+
+        # Get task instance
+        self.idx = index if index is not None else self.idx
+        self.record = self.data[self.idx]
+        self.idx += 1
+
+        # Set query, gold command
+        self.base_commit = self.record["base_commit"]
+        self.query = self.record["problem_statement"]
+        self.reward = None
+
+        ### Reset Container ###
+
+        # Clone repository if not already cloned
+        self.communicate(input="cd /")
+        folders = self.communicate(input="ls").split("\n")
+        repo_name = self.record["repo"].replace("/", "__")
+        if repo_name not in folders:
+            if not self.args.no_mirror and not self.is_from_github_url:
+                self.logger.info(f"{repo_name} not found in container, cloning...")
+                self.communicate_with_handling(
+                    input=f"git clone https://{self.token}@github.com/swe-bench/{repo_name}.git",
+                    error_msg="Failed to clone repository from mirror",
+                    timeout_duration=LONG_TIMEOUT,
+                )
+            else:
+                logger.info("Trying to clone from non-mirror...")
+                self.communicate_with_handling(
+                    input=f"git clone https://{self.token}@github.com/{self.record['repo']}.git {repo_name}",
+                    error_msg="Failed to clone repository from non-mirror",
+                    timeout_duration=LONG_TIMEOUT,
+                )
+
+        # Clean repository of any modifications + Checkout base commit
+        for cmd in [
+            "echo -n > /root/files_to_edit.txt",
+            f"cd {repo_name}",
+            "export ROOT=$(pwd -P)",
+            "git status",
+            "git restore .",
+            f"git reset --hard {self.base_commit}",
+            "git clean -fdxq",
+        ]:
+            self.communicate_with_handling(
+                input=cmd,
+                error_msg="Failed to clean repository",
+            )
+
+        # Reset environment variables
+        for cmd in [
+            'export CURRENT_FILE=""',
+            "export CURRENT_LINE=0",
+            "export SEARCH_RESULTS=()",
+            "export SEARCH_FILES=()",
+            "export SEARCH_INDEX=0",
+        ]:
+            self.communicate_with_handling(
+                input=cmd,
+                error_msg="Failed to reset environment variables",
+            )
+
+        # Set up environment
+        self.communicate_with_handling(
+            "source /root/miniconda3/etc/profile.d/conda.sh",
+            error_msg="Failed to source conda",
+        )
+
+        system = self.communicate("uname -s").strip().lower()
+        arch = self.communicate("uname -m").strip().lower()
+        if system == 'linux' and arch == 'x86_64':
+            self.communicate_with_handling(
+                "apt update; apt install build-essential -y",
+                error_msg="Failed to install build-essential",
+                timeout_duration=LONG_TIMEOUT,
+            )
+
+        # Call install environment helper function if specified
+        if self.install_environment:
+            if self.is_from_github_url:
+                logger.warning((
+                    "install_environment is set to True, but the data path is a GitHub URL. "
+                    "Skipping conda environment installation."
+                ))
+            else:
+                self.install_env()
+            # Install flake8 for linting purposes
+            self.communicate_with_handling(
+                "pip install flake8",
+                error_msg="Failed to install flake8 (lint library)"
+            )
+
+        # Apply test patch for oracle setting
+        if apply_test_patch:
+            path_to_patch = "test.patch"
+            with open(path_to_patch, "w") as f:
+                f.write(self.record["test_patch"])
+            subprocess.run(
+                f"docker cp {path_to_patch} {self.container_name}:/root/test.patch",
+                shell=True,
+            )
+            self.communicate_with_handling(
+                input="git apply /root/test.patch",
+                error_msg="Failed to apply test patch correctly"
+            )
+            os.remove(path_to_patch)
+
+        # Write any metadata to info if necessary
+        return None, info
+
+    def step(self, action: str) -> Tuple[str, int, bool, dict]:
+        """
+        Runs given action in environment and returns corresponding output
+
+        Args:
+            action (`str`) - command to run in bash shell
+
+        Returns:
+            observation (`str`) - output from container
+            reward (`float`) - value between 0 and 1 quantifying correctness of output + environment state
+            done (`bool`) - whether task is over
+            info (`dict`) - additional information (e.g. debugging information)
+        """
+        info = {}
+
+        observation = ""
+        # Handle special actions
+        if action.strip() == "skip":
+            observation = "Skipped"
+            info["exit_status"] = "skipped"
+            return observation, 0, True, info
+        if action in {"exit_context", "exit_cost", "exit_error", "exit_format", "exit_api"}:
+            try:
+                observation = self.communicate(input="submit")
+                submission = self.get_submission('submit', observation)
+                assert submission is not None and submission.strip() != "", 'No submission found.'
+                self.logger.info(f"Found submission: {submission}")
+                info["exit_status"] = f"submitted ({action})"
+                info["submission"] = submission
+                observation = "Exited (autosubmitted)"
+                logger.info("Exiting with autosubmission")
+                return observation, 0, True, info
+            except KeyboardInterrupt:
+                raise
+            except:
+                observation = "Exited"
+                info["exit_status"] = action
+                return observation, 0, True, info
+
+        # Attempt to run action in container
+        observation = ""
+        try:
+            observation = self.communicate(input=action, timeout_duration=25)
+        except TimeoutError:
+            try:
+                self.interrupt()
+                observation += "\nEXECUTION TIMED OUT"
+            except RuntimeError as e:
+                observation += "\nEXECUTION TIMED OUT AND INTERRUPT FAILED. RESTARTING PROCESS."
+                info["exit_status"] = "early_exit"
+                logger.warning(f"Failed to interrupt container: {e}\nRESTARTING PROCESS.")
+                self.reset_container()
+                return observation, 0, True, info
+        except RuntimeError as e:
+            observation += "\nCOMMAND FAILED TO EXECUTE. RESTARTING PROCESS."
+            info["exit_status"] = "early_exit"
+            logger.warning(f"Failed to execute command: {e}\nRESTARTING PROCESS.")
+            self.reset_container()
+            return observation, 0, True, info
+        except BrokenPipeError as e:
+            observation += "\nBROKEN PIPE ERROR. RESTARTING PROCESS."
+            info["exit_status"] = "early_exit"
+            logger.error(f"Broken pipe error: {e}\nRESTARTING PROCESS.")
+            self.reset_container()
+            return observation, 0, True, info
+        except Exception:
+            observation += "\nEXECUTION FAILED OR COMMAND MALFORMED"
+
+        # Record submission and end episode if `submit` keyword found
+        submission = self.get_submission(action, observation)
+        if submission is not None:
+            self.logger.info(f"Found submission: {submission}")
+            info["exit_status"] = "submitted"
+            info["submission"] = submission if submission.strip() != "" else None
+            observation = submission if submission.strip() != "" else None
+            return observation, 0, True, info
+        return observation, 0, False, info
+ info["exit_status"] = "early_exit" + logger.error(f"Broken pipe error: {e}\nRESTARTING PROCESS.") + self.reset_container() + return observation, 0, True, info + except Exception as e: + observation += "\nEXECUTION FAILED OR COMMAND MALFORMED" + + # Record submission and end episode if `submit` keyword found + submission = self.get_submission(action, observation) + if submission is not None: + self.logger.info(f"Found submission: {submission}") + info["exit_status"] = "submitted" + info["submission"] = submission if submission.strip() != "" else None + observation = submission if submission.strip() != "" else None + return observation, 0, True, info + return observation, 0, False, info + + def close(self): + """ + Handle environment shutdown + """ + self.logger.info("Beginning environment shutdown...") + try: + self.communicate(input="exit") + except KeyboardInterrupt: + raise + except: + pass + self.container.terminate() + if self.persistent: + if self.container_obj.status not in {"paused", "exited"}: + self.container_obj.pause() + self.logger.info("Agent container paused") + else: + self.logger.info(f"Agent container status: {self.container_obj.status}") + else: + try: + self.container_obj.remove(force=True) + except KeyboardInterrupt: + raise + except: + pass + self.logger.info("Agent container stopped") + + # MARK: Helper functions # + + def _reset_container(self) -> None: + if hasattr(self, "container"): + try: + self.container.terminate() + except KeyboardInterrupt: + raise + except: + pass + self._init_container() + self._init_scripts() + + def reset_container(self) -> None: + self.close() + self.container = None + self.container_obj = None + self._reset_container() + + def _init_container(self) -> None: + """ + Handles container initialization. Defines container name and creates it + """ + if self.container_name is None: + process_id = str(os.getpid()) + current_time = str(datetime.datetime.now()) + unique_string = current_time + process_id + hash_object = hashlib.sha256(unique_string.encode()) + self.container_name = f"{self.image_name}-{hash_object.hexdigest()[:10]}" + self.container, self.parent_pids = get_container( + self.container_name, self.image_name, persistent=self.persistent + ) + client = docker.from_env() + self.container_obj = client.containers.get(self.container_name) + self.logger.info("๐ŸŒฑ Environment Initialized") + + def _init_scripts(self): + """ + Initialize custom commands within container + """ + self.communicate_with_handling( + "source /root/.bashrc", + error_msg="Failed to source .bashrc", + ) + self.communicate_with_handling( + "mkdir -p /root/commands", + error_msg="Failed to create commands directory", + ) + self.communicate_with_handling( + "touch /root/commands/__init__.py", + error_msg="Failed to create __init__.py", + ) + self.communicate_with_handling( + "export PATH=$PATH:/root/commands", + error_msg="Failed to add commands directory to PATH", + ) + + def _communicate( + self, + input: str, + timeout_duration=25, + ) -> str: + try: + self.returncode = None + cmd = input if input.endswith("\n") else input + "\n" + self.container.stdin.write(cmd) + time.sleep(0.1) + self.container.stdin.flush() + except BrokenPipeError: + traceback.print_exc() + self.logger.error( + "Failed to communicate with container. Check docker logs for more information." 
+
+    def _communicate(
+        self,
+        input: str,
+        timeout_duration=25,
+    ) -> str:
+        try:
+            self.returncode = None
+            cmd = input if input.endswith("\n") else input + "\n"
+            self.container.stdin.write(cmd)
+            time.sleep(0.1)
+            self.container.stdin.flush()
+        except BrokenPipeError:
+            traceback.print_exc()
+            self.logger.error(
+                "Failed to communicate with container. Check docker logs for more information."
+            )
+            raise RuntimeError("Failed to communicate with container")
+        try:
+            buffer = read_with_timeout(self.container, self.get_pids, timeout_duration)
+            self.container.stdin.write("echo $?\n")
+            time.sleep(0.1)
+            self.container.stdin.flush()
+            exit_code = read_with_timeout(self.container, self.get_pids, 5).strip()
+        except Exception as e:
+            self.logger.error(f"Read with timeout failed on input:\n---\n{input}\n---")
+            raise e
+        if not exit_code.isdigit():
+            raise RuntimeError(f"Container crashed. Failed to get exit code. Output:\n---\n{buffer}\n---")
+        self.returncode = int(exit_code)
+        return buffer
+
+    def _check_syntax(self, input: str) -> Tuple[str, bool]:
+        """
+        Checks the syntax of the input with `bash -n`, without executing it
+        """
+        output = self._communicate(f"/bin/bash -n <<'EOF'\n{input}\nEOF\n")
+        return output, self.returncode == 0
+
+    def communicate(
+        self,
+        input: str,
+        timeout_duration=25,
+    ) -> str:
+        """
+        Sends input to container and returns output
+
+        Args:
+            input (`str`) - input to send to container
+
+        Returns:
+            output (`str`) - output from container
+        """
+        if input.strip() != "exit":
+            output, valid = self._check_syntax(input)
+            if not valid:
+                return output  # shows syntax errors
+            output = self._communicate(
+                input, timeout_duration=timeout_duration,
+            )
+            self.communicate_output = output
+            return output
+        else:
+            self.container.terminate()
+            self.returncode = 0
+            self.communicate_output = ""
+            return ""
+
+    def communicate_with_handling(
+        self, input: str, error_msg: str, timeout_duration=25
+    ):
+        """
+        Wrapper for communicate function that raises error if return code is non-zero
+        """
+        logs = self.communicate(input, timeout_duration=timeout_duration)
+        if self.returncode != 0:
+            self.logger.error(f"{error_msg}: {logs}")
+            self.close()
+            raise RuntimeError(f"{error_msg}: {logs}")
+
+    def get_available_actions(self) -> list[str]:
+        """
+        Returns list of available actions in current environment state
+        """
+        return []
+
+    def get_pids(self, all_pids=False) -> list[str]:
+        """
+        Gets list of processes running inside docker container
+        """
+        pids = (
+            self.container_obj.exec_run("ps -eo pid,comm --no-headers")
+            .output.decode()
+            .split("\n")
+        )
+        pids = [x.split() for x in pids if x]
+        if not all_pids:
+            pids = [x for x in pids if x[1] != "ps" and x[0] not in self.parent_pids]
+        return pids
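The `bash -n` pre-flight in `_check_syntax` is easy to reproduce outside the container: `-n` makes bash parse its input without executing anything, so malformed commands are rejected before they reach the agent's shell session. A standalone illustration (snippets invented):

```python
import subprocess

for snippet in ["for f in *.py; do echo $f; done",   # valid
                "for f in *.py; do echo $f"]:        # missing `done`
    proc = subprocess.run(
        ["bash", "-n"], input=snippet, text=True, capture_output=True
    )
    print("ok" if proc.returncode == 0
          else f"syntax error: {proc.stderr.strip()}")
```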
+
+    def get_submission(self, action, output: str) -> str:
+        """
+        Function for extracting diff patch submission at the end of an episode.
+
+        Args:
+            action (`str`) - action that produced the output
+            output (`str`) - `submit` observation
+        Returns:
+            submission (`str`) - diff patch submission
+        """
+        pattern = r"\<\<SUBMISSION\|\|(.*)\|\|SUBMISSION\>\>"
+        match = re.search(pattern, output, re.DOTALL)
+        if match is None:
+            return None
+        return match.group(1)
+
+    def install_env(self) -> None:
+        """
+        Creates conda environment and installs third party dependencies to allow code execution
+        """
+        repo_name = self.record["repo"].replace("/", "__")
+        # Create environment if does not exist yet
+        env_name = f"{repo_name}__{self.record['version']}"
+        env_check = self.communicate(
+            f"conda env list | grep {env_name}", timeout_duration=LONG_TIMEOUT
+        )
+        install_configs = MAP_VERSION_TO_INSTALL[self.record["repo"]][
+            str(self.record["version"])
+        ]
+        if env_check.strip() == "":
+            self.logger.info(f"{env_name} conda env not found, creating...")
+            packages = (
+                install_configs.get("packages", "")
+            )
+            if packages == "requirements.txt":
+                # Create conda environment
+                self.communicate_with_handling(
+                    f"conda create -n {env_name} python={install_configs['python']} -y",
+                    error_msg="Failed to create conda environment",
+                    timeout_duration=LONG_TIMEOUT,
+                )
+                # Write reqs to requirements.txt in docker container
+                content_reqs = get_requirements(self.record)
+                copy_file_to_container(self.container_obj, content_reqs, PATH_TO_REQS)
+                # Activate conda environment + install reqs
+                self.communicate_with_handling(
+                    f"conda activate {env_name}",
+                    error_msg="Failed to activate conda environment",
+                )
+                self.communicate_with_handling(
+                    f"pip install -r {PATH_TO_REQS}",
+                    error_msg="Failed to install requirements.txt",
+                    timeout_duration=LONG_TIMEOUT,
+                )
+                self.communicate(f"rm {PATH_TO_REQS}")
+            elif packages == "environment.yml":
+                # Write environment.yml to file
+                content_env_yml = get_environment_yml(self.record, env_name)
+                copy_file_to_container(self.container_obj, content_env_yml, PATH_TO_ENV_YML)
+                if "no_use_env" in install_configs and install_configs["no_use_env"]:
+                    # Create conda environment
+                    self.communicate_with_handling(
+                        f"conda create -c conda-forge -n {env_name} python={install_configs['python']} -y",
+                        error_msg="Failed to create conda environment",
+                        timeout_duration=LONG_TIMEOUT,
+                    )
+                    # Install packages
+                    self.communicate_with_handling(
+                        f"conda env update -f {PATH_TO_ENV_YML}",
+                        error_msg="Failed to install environment.yml",
+                        timeout_duration=LONG_TIMEOUT
+                    )
+                else:
+                    # Create environment + install packages
+                    self.communicate_with_handling(
+                        f"conda env create --file {PATH_TO_ENV_YML}",
+                        error_msg="Failed to create conda environment with environment.yml",
+                        timeout_duration=LONG_TIMEOUT,
+                    )
+                self.communicate(f"rm {PATH_TO_ENV_YML}")
+            else:
+                # Create environment + install packages
+                self.communicate_with_handling(
+                    f"conda create -n {env_name} python={install_configs['python']} {packages} -y",
+                    error_msg="Failed to create conda environment",
+                    timeout_duration=LONG_TIMEOUT,
+                )
+            # Install extra pip packages if specified
+            if "pip_packages" in install_configs:
+                self.communicate_with_handling(
+                    f"source activate {env_name} && pip install {install_configs['pip_packages']}",
+                    error_msg="Failed to install pip packages",
+                    timeout_duration=LONG_TIMEOUT
+                )
+
+        # Activate environment
+        self.communicate_with_handling(
+            f"conda activate {env_name}",
+            error_msg="Failed to activate conda environment"
+        )
+
+        # Install repo at base commit
+        if "pre_install" in install_configs:
+            self.logger.info("Running pre-install commands...")
+            for pre_install_cmd in install_configs["pre_install"]:
+                self.communicate_with_handling(
+                    pre_install_cmd,
+                    error_msg="Pre-install commands failed to execute successfully",
+                )
+        self.logger.info(f"Installing {repo_name} at base commit...")
+        if "install" in install_configs:
+            install_cmd = install_configs["install"]
+            self.communicate_with_handling(
+                install_cmd,
+                error_msg="Install command failed to execute successfully",
+                timeout_duration=LONG_TIMEOUT
+            )
+        if "post_install" in install_configs:
+            self.logger.info("Running post-install commands...")
+            for post_install_cmd in install_configs["post_install"]:
+                self.communicate_with_handling(
+                    post_install_cmd,
+                    error_msg="Post-install commands failed to execute successfully",
+                )
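To see how `get_submission` recovers a patch, here is the marker extraction run on invented output, assuming the `<<SUBMISSION||...||SUBMISSION>>` marker format reconstructed in the pattern above:

```python
import re

output = (
    "some trailing logs\n"
    "<<SUBMISSION||\ndiff --git a/x.py b/x.py\n||SUBMISSION>>"
)
match = re.search(r"\<\<SUBMISSION\|\|(.*)\|\|SUBMISSION\>\>", output, re.DOTALL)
if match is not None:
    print(match.group(1).strip())   # the diff patch text
```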
+
+    def add_commands(self, commands: list[dict]) -> None:
+        """
+        Adds custom commands to container
+        """
+        for command in commands:
+            name = command["name"]
+            contents = command["contents"]
+            copy_file_to_container(self.container_obj, contents, f"/root/commands/{name}")
+            if command['type'] == "source_file":
+                self.communicate_with_handling(
+                    f"source /root/commands/{name}",
+                    error_msg=(
+                        f"Failed to source {name}. If you meant to make a script,"
+                        " start the file with a shebang (e.g. #!/usr/bin/env python)."
+                    )
+                )
+            elif command['type'] == "script":
+                self.communicate_with_handling(
+                    f"chmod +x /root/commands/{name}",
+                    error_msg=f"Failed to chmod {name}",
+                )
+            elif command['type'] == "utility":
+                # nothing to do for utility scripts
+                pass
+            else:
+                raise ValueError(f"Invalid command type: {command['type']}")
+
+    def interrupt(self):
+        """
+        Send interrupt signal to container and exhaust stdout buffer with a communicate call
+        """
+        pids = self.get_pids()
+        for pid, cmd in pids:
+            if pid not in self.parent_pids and cmd != "ps":
+                self.container_obj.exec_run(f"kill -9 {pid}")
+        try:
+            _ = read_with_timeout(self.container, self.get_pids, 20)
+        except TimeoutError:
+            pass
+        try:
+            output = self.communicate(input="echo 'interrupted'", timeout_duration=5)
+            assert output.strip().endswith("interrupted"), "container health check failed"
+        except TimeoutError:
+            raise RuntimeError("Failed to interrupt container")
diff --git a/sweagent/environment/utils.py b/sweagent/environment/utils.py
new file mode 100644
index 000000000..c3efc6559
--- /dev/null
+++ b/sweagent/environment/utils.py
@@ -0,0 +1,328 @@
+import docker
+import json
+import logging
+import os
+import re
+import select
+import signal
+import subprocess
+import tarfile
+import tempfile
+import time
+import traceback
+
+from datasets import load_dataset, load_from_disk
+from ghapi.all import GhApi
+from io import BytesIO
+from pathlib import Path
+from subprocess import PIPE, STDOUT
+from typing import Tuple
+
+LOGGER_NAME = "intercode"
+START_UP_DELAY = 5
+TIMEOUT_DURATION = 25
+GITHUB_ISSUE_URL_PATTERN = re.compile(r'github\.com\/(.*?)\/(.*?)\/issues\/(\d+)')
+
+logger = logging.getLogger(LOGGER_NAME)
+
+
+def get_data_path_name(data_path: str):
+    # if data_path is a file, return the file stem
+    # elif it's a github url, return the owner__repo_name
+    match = GITHUB_ISSUE_URL_PATTERN.search(data_path)
+    if match:
+        owner, repo, issue_number = match.groups()
+        return f"{owner}__{repo}"
+    return Path(data_path).stem
+
+
+def is_from_github_url(data_path: str):
+    return GITHUB_ISSUE_URL_PATTERN.search(data_path) is not None
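The issue-URL regex above drives both the data-path naming and the GitHub-issue code path. A quick demonstration on an illustrative URL:

```python
import re

GITHUB_ISSUE_URL_PATTERN = re.compile(r'github\.com\/(.*?)\/(.*?)\/issues\/(\d+)')

url = "https://github.com/princeton-nlp/SWE-agent/issues/1"  # illustrative URL
match = GITHUB_ISSUE_URL_PATTERN.search(url)
if match:
    owner, repo, issue_number = match.groups()
    print(f"{owner}__{repo}", issue_number)   # princeton-nlp__SWE-agent 1
```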
+
+
+def copy_file_to_container(container, contents, container_path):
+    """
+    Copies a given string into a Docker container at a specified path.
+
+    Args:
+        - container: Docker SDK container object.
+        - contents: The string to copy into the container.
+        - container_path: The path inside the container where the string should be copied to.
+
+    Returns:
+        - None
+    """
+    temp_file_name = None
+
+    try:
+        # Create a temporary file
+        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+            temp_file_name = temp_file.name
+            # Write the string to the temporary file and ensure it's written to disk
+            temp_file.write(contents.encode('utf-8'))
+            temp_file.flush()
+            os.fsync(temp_file.fileno())
+
+        # Create a TAR archive in memory containing the temporary file
+        with open(temp_file_name, 'rb') as temp_file:
+            # Prepare the TAR archive
+            with BytesIO() as tar_stream:
+                with tarfile.open(fileobj=tar_stream, mode='w') as tar:
+                    tar_info = tarfile.TarInfo(name=os.path.basename(container_path))
+                    tar_info.size = os.path.getsize(temp_file_name)
+                    tar.addfile(tarinfo=tar_info, fileobj=temp_file)
+                tar_stream.seek(0)
+                # Copy the TAR stream to the container
+                container.put_archive(path=os.path.dirname(container_path), data=tar_stream.read())
+
+    except Exception as e:
+        logger.error(f"An error occurred: {e}")
+        logger.error(traceback.format_exc())
+    finally:
+        # Cleanup: Remove the temporary file if it was created
+        if temp_file_name and os.path.exists(temp_file_name):
+            os.remove(temp_file_name)
+
+
+def read_with_timeout(container, pid_func, timeout_duration):
+    """
+    Read data from a subprocess with a timeout.
+    This function uses a file descriptor to read data from the subprocess in a non-blocking way.
+
+    Args:
+        container (subprocess.Popen): The subprocess container.
+        pid_func (function): A function that returns a list of process IDs (except the PID of the main process).
+        timeout_duration (int): The timeout duration in seconds.
+
+    Returns:
+        str: The data read from the subprocess, stripped of trailing newline characters.
+
+    Raises:
+        TimeoutError: If the timeout duration is reached while reading from the subprocess.
+    """
+    buffer = b""
+    fd = container.stdout.fileno()
+    end_time = time.time() + timeout_duration
+
+    while time.time() < end_time:
+        pids = pid_func()
+        if len(pids) > 0:
+            # There are still PIDs running
+            time.sleep(0.05)
+            continue
+        ready_to_read, _, _ = select.select([fd], [], [], 0.1)
+        if ready_to_read:
+            data = os.read(fd, 4096)
+            if data:
+                buffer += data
+        else:
+            # No more data to read
+            break
+        time.sleep(0.05)  # Prevents CPU hogging
+
+    if container.poll() is not None:
+        raise RuntimeError("Subprocess exited unexpectedly.\nCurrent buffer: {}".format(buffer.decode()))
+    if time.time() >= end_time:
+        raise TimeoutError("Timeout reached while reading from subprocess.\nCurrent buffer: {}\nRunning PIDs: {}".format(buffer.decode(), pids))
+    return buffer.decode()
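Stepping back to `copy_file_to_container`: the Docker SDK's `put_archive` only accepts tar data, which is why the helper wraps a single file in an in-memory archive. A minimal standalone version of that payload construction (file name and contents are invented):

```python
import io
import tarfile

contents = b"echo hello\n"
stream = io.BytesIO()
# Build a one-member tar entirely in memory.
with tarfile.open(fileobj=stream, mode="w") as tar:
    info = tarfile.TarInfo(name="hello.sh")
    info.size = len(contents)
    tar.addfile(tarinfo=info, fileobj=io.BytesIO(contents))
stream.seek(0)
payload = stream.read()   # bytes suitable for container.put_archive(path, payload)
print(len(payload), "bytes of tar data")
```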
+ """ + buffer = b"" + fd = container.stdout.fileno() + end_time = time.time() + timeout_duration + + while time.time() < end_time: + pids = pid_func() + if len(pids) > 0: + # There are still PIDs running + time.sleep(0.05) + continue + ready_to_read, _, _ = select.select([fd], [], [], 0.1) + if ready_to_read: + data = os.read(fd, 4096) + if data: + buffer += data + else: + # No more data to read + break + time.sleep(0.05) # Prevents CPU hogging + + if container.poll() is not None: + raise RuntimeError("Subprocess exited unexpectedly.\nCurrent buffer: {}".format(buffer.decode())) + if time.time() >= end_time: + raise TimeoutError("Timeout reached while reading from subprocess.\nCurrent buffer: {}\nRunning PIDs: {}".format(buffer.decode(), pids)) + return buffer.decode() + + +class timeout: + def __init__(self, seconds=TIMEOUT_DURATION, error_message="Timeout"): + self.seconds = seconds + self.error_message = error_message + + def handle_timeout(self, signum, frame): + raise TimeoutError(self.error_message) + + def __enter__(self): + signal.signal(signal.SIGALRM, self.handle_timeout) + signal.alarm(self.seconds) + + def __exit__(self, type, value, traceback): + signal.alarm(0) + + +def get_background_pids(container_obj): + pids = ( + container_obj.exec_run("ps -eo pid,comm --no-headers") + .output.decode() + .split("\n") + ) + pids = [x.split() for x in pids if x] + pids = [x for x in pids if x[1] not in {"ps"} and x[0] != "1"] + bash_pids = [x for x in pids if x[1] == "bash"] + other_pids = [x for x in pids if x[1] not in {"bash"}] + return bash_pids, other_pids + + +def _get_non_persistent_container(ctr_name: str, image_name: str) -> Tuple[subprocess.Popen, set]: + startup_cmd = [ + "docker", + "run", + "-i", + "--rm", + "--name", + ctr_name, + image_name, + "/bin/bash", + "-l", + "-m", + ] + container = subprocess.Popen( + startup_cmd, + stdin=PIPE, + stdout=PIPE, + stderr=STDOUT, + text=True, + bufsize=1, # line buffered + ) + time.sleep(START_UP_DELAY) + # try to read output from container setup (usually an error), timeout if no output + try: + with timeout(seconds=2): + output = container.stdout.read() + if output: + logger.error(f"Unexpected container setup output: {output}") + except TimeoutError: + pass + return container, {"1", } # bash PID is always 1 for non-persistent containers + + +def _get_persistent_container(ctr_name: str, image_name: str, persistent: bool = False) -> Tuple[subprocess.Popen, set]: + client = docker.from_env() + containers = client.containers.list(all=True, filters={"name": ctr_name}) + if ctr_name in [c.name for c in containers]: + container_obj = client.containers.get(ctr_name) + if container_obj.status in {"created"}: + container_obj.start() + elif container_obj.status in {"running"}: + pass + elif container_obj.status in {"exited"}: + container_obj.restart() + elif container_obj.status in {"paused"}: + container_obj.unpause() + else: + raise RuntimeError(f"Unexpected container status: {container_obj.status}") + else: + container_obj = client.containers.run( + image_name, + command='/bin/bash -l -m', + name=ctr_name, + stdin_open=True, + tty=True, + detach=True, + auto_remove=not persistent, + ) + container_obj.start() + startup_cmd = [ + "docker", + "exec", + "-i", + ctr_name, + "/bin/bash", + "-l", + "-m", + ] + container = subprocess.Popen( + startup_cmd, + stdin=PIPE, + stdout=PIPE, + stderr=STDOUT, + text=True, + bufsize=1, # line buffered + ) + time.sleep(START_UP_DELAY) + # try to read output from container setup (usually an error), timeout if no 
+
+
+def _get_persistent_container(ctr_name: str, image_name: str, persistent: bool = False) -> Tuple[subprocess.Popen, set]:
+    client = docker.from_env()
+    containers = client.containers.list(all=True, filters={"name": ctr_name})
+    if ctr_name in [c.name for c in containers]:
+        container_obj = client.containers.get(ctr_name)
+        if container_obj.status in {"created"}:
+            container_obj.start()
+        elif container_obj.status in {"running"}:
+            pass
+        elif container_obj.status in {"exited"}:
+            container_obj.restart()
+        elif container_obj.status in {"paused"}:
+            container_obj.unpause()
+        else:
+            raise RuntimeError(f"Unexpected container status: {container_obj.status}")
+    else:
+        container_obj = client.containers.run(
+            image_name,
+            command='/bin/bash -l -m',
+            name=ctr_name,
+            stdin_open=True,
+            tty=True,
+            detach=True,
+            auto_remove=not persistent,
+        )
+        container_obj.start()
+    startup_cmd = [
+        "docker",
+        "exec",
+        "-i",
+        ctr_name,
+        "/bin/bash",
+        "-l",
+        "-m",
+    ]
+    container = subprocess.Popen(
+        startup_cmd,
+        stdin=PIPE,
+        stdout=PIPE,
+        stderr=STDOUT,
+        text=True,
+        bufsize=1,  # line buffered
+    )
+    time.sleep(START_UP_DELAY)
+    # try to read output from container setup (usually an error), timeout if no output
+    try:
+        with timeout(seconds=2):
+            output = container.stdout.read()
+            if output:
+                logger.error(f"Unexpected container setup output: {output}")
+    except TimeoutError:
+        pass
+    # Get the process IDs of the container
+    # There should be at least a head process and possibly one child bash process
+    bash_pids, other_pids = get_background_pids(container_obj)
+    bash_pid = 1
+    if len(bash_pids) == 1:
+        bash_pid = bash_pids[0][0]
+    elif len(bash_pids) > 1 or len(other_pids) > 0:
+        raise RuntimeError(f"Detected alien processes attached or running. Please ensure that no other agents are running on this container. PIDs: {bash_pids}, {other_pids}")
+    return container, set(map(str, [bash_pid, 1, ]))
+
+
+def get_container(ctr_name: str, image_name: str, persistent: bool = False) -> Tuple[subprocess.Popen, set]:
+    """
+    Get a container object for a given container name and image name
+
+    Arguments:
+        ctr_name (str): Name of container
+        image_name (str): Name of image
+        persistent (bool): Whether to use a persistent container or not
+    Returns:
+        Tuple of the container process (subprocess.Popen) and the set of parent PIDs
+    """
+    if persistent:
+        return _get_persistent_container(ctr_name, image_name, persistent=persistent)
+    else:
+        return _get_non_persistent_container(ctr_name, image_name)
+
+
+def get_commit(api: GhApi, owner: str, repo: str, base_commit: str = None):
+    if base_commit:
+        commit = api.repos.get_commit(owner, repo, base_commit)
+    else:
+        commit = api.repos.list_commits(owner, repo)[0]
+    return commit
+
+
+def get_instances(file_path: str, base_commit: str = None, split: str = None):
+    """
+    Getter function for handling json, jsonl files
+
+    Arguments:
+        file_path (str): Path to file
+    Returns:
+        List of instances
+    """
+    # If file_path is a directory, attempt load from disk
+    if os.path.isdir(file_path):
+        return load_from_disk(file_path, split=split)
+
+    # If file_path is a github issue url, fetch the issue and return a single instance
+    if is_from_github_url(file_path):
+        match = GITHUB_ISSUE_URL_PATTERN.search(file_path)
+        api = GhApi()
+        if match:
+            owner, repo, issue_number = match.groups()
+            record = dict()
+            issue = api.issues.get(owner, repo, issue_number)
+            title = issue.title if issue.title else ""
+            body = issue.body if issue.body else ""
+            text = f"{title}\n{body}\n"
+            record["repo"] = f"{owner}/{repo}"
+            record["base_commit"] = base_commit if base_commit else get_commit(api, owner, repo, base_commit).sha
+            record["version"] = record["base_commit"][:7]
+            record["problem_statement"] = text
+            record["instance_id"] = f"{owner}__{repo}-i{issue_number}"
+            return [record,]
+    elif base_commit is not None:
+        raise ValueError("base_commit must be None if data_path is not a github issue url")
+
+    # If file_path is a file, load the file
+    if file_path.endswith(".json"):
+        return json.load(open(file_path))
+    if file_path.endswith(".jsonl"):
+        return [json.loads(x) for x in open(file_path, 'r').readlines()]
+
+    # Attempt load from HF datasets as a last resort
+    try:
+        return load_dataset(file_path, split=split)
+    except:
+        raise ValueError(
+            f"Could not load instances from {file_path}. "
+            "Please ensure --data_path is a GitHub URL, a SWE-bench HuggingFace dataset, or a JSON/JSONL file."
+        )
\ No newline at end of file
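Hedged examples of two data-path forms `get_instances` accepts; the file path is a placeholder, and the GitHub URL is only illustrative (an unauthenticated `GhApi` call is rate-limited but works for public issues):

```python
from sweagent.environment.utils import get_instances

# 1) Local JSONL file of SWE-bench task instances
instances = get_instances("data/dev-easy.jsonl")

# 2) A single live GitHub issue, turned into a one-element instance list
instances = get_instances("https://github.com/princeton-nlp/SWE-agent/issues/1")
print(instances[0]["instance_id"])   # e.g. princeton-nlp__SWE-agent-i1
```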
diff --git a/trajectories/README.md b/trajectories/README.md
new file mode 100644
index 000000000..5b4d81c11
--- /dev/null
+++ b/trajectories/README.md
@@ -0,0 +1,45 @@
+# Trajectories
+
+The `trajectories/` folder is the default location that experiment results (invocations of `run.py`) will be written to.
+
+At a high level, the experiments folder is organized in the following manner:
+```
+trajectories
+├── 👩‍💻 <username>
+│   ├── 🧪 <experiment name>
+│   │   ├── all_preds.jsonl
+│   │   ├── args.yaml
+│   │   ├── *.html (Webpage Files)
+│   │   └── *.traj (Trajectories)
+│   └── 🧪 <experiment name>
+│       ├── all_preds.jsonl
+│       ├── args.yaml
+│       ├── *.html (Webpage Files)
+│       └── *.traj (Trajectories)
+├── 👨‍💻 <username>
+│   ├── 🧪 <experiment name>
+│   │   └── ...
+│   └── 🧪 <experiment name>
+│       └── ...
+...
+```
+Where every experiment follows the pattern `trajectories/<username>/<experiment name>/`. The `<username>` is automatically inferred from your system, and the `<experiment name>` is inferred from the arguments of the `run.py` invocation.
+
+## How an Experiment Folder is Generated
+
+Each call to `run.py` produces a single `trajectories/<username>/<experiment name>/` folder containing the following assets:
+* `all_preds.jsonl`: A single file containing all of the predictions generated for the experiment (1 prediction per task instance), where each line is formatted as:
+```
+{
+    "instance_id": "<instance_id>",
+    "model_patch": "<.patch file content string>",
+    "model_name_or_path": "<model name>",
+}
+```
+* `args.yaml`: A summary of the configurations for the experiment run.
+* `<instance_id>.traj`: A `.json` formatted file containing the (thought, action, observation) turns generated by SWE-agent towards solving `<instance_id>`.
+* `<instance_id>.html`: A single-webpage `.html` render of the trajectory, which can be opened directly in the browser for easier viewing.
+
+> ⚠️ Notes
+> * Evaluation is not performed by `run.py`; it is a separate step.
+> * `all_preds.jsonl` can be passed directly to `evaluation/run_eval.sh` to run evaluation.
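Since each line of `all_preds.jsonl` is an independent JSON record, collecting an experiment's predictions is a one-liner. A sketch, where the username and experiment-name path segments are placeholders:

```python
import json
from pathlib import Path

preds_path = Path("trajectories/jane/my-experiment/all_preds.jsonl")  # placeholder path
preds = [json.loads(line) for line in preds_path.read_text().splitlines() if line]
print(len(preds), "predictions")
if preds:
    print(preds[0]["instance_id"], len(preds[0]["model_patch"]), "patch chars")
```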